xref: /llvm-project/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp (revision 4f33a68973bfd9fa429b57528c3fe5443f59a734)
1 //===- LoopUnroll.cpp - Loop unroller pass --------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass implements a simple loop unroller.  It works best when loops have
10 // been canonicalized by the -indvars pass, allowing it to determine the trip
11 // counts of loops easily.
12 //===----------------------------------------------------------------------===//
13 
14 #include "llvm/Transforms/Scalar/LoopUnrollPass.h"
15 #include "llvm/ADT/DenseMap.h"
16 #include "llvm/ADT/DenseMapInfo.h"
17 #include "llvm/ADT/DenseSet.h"
18 #include "llvm/ADT/None.h"
19 #include "llvm/ADT/Optional.h"
20 #include "llvm/ADT/STLExtras.h"
21 #include "llvm/ADT/SetVector.h"
22 #include "llvm/ADT/SmallPtrSet.h"
23 #include "llvm/ADT/SmallVector.h"
24 #include "llvm/ADT/StringRef.h"
25 #include "llvm/Analysis/AssumptionCache.h"
26 #include "llvm/Analysis/BlockFrequencyInfo.h"
27 #include "llvm/Analysis/CodeMetrics.h"
28 #include "llvm/Analysis/LazyBlockFrequencyInfo.h"
29 #include "llvm/Analysis/LoopAnalysisManager.h"
30 #include "llvm/Analysis/LoopInfo.h"
31 #include "llvm/Analysis/LoopPass.h"
32 #include "llvm/Analysis/LoopUnrollAnalyzer.h"
33 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
34 #include "llvm/Analysis/ProfileSummaryInfo.h"
35 #include "llvm/Analysis/ScalarEvolution.h"
36 #include "llvm/Analysis/TargetTransformInfo.h"
37 #include "llvm/IR/BasicBlock.h"
38 #include "llvm/IR/CFG.h"
39 #include "llvm/IR/Constant.h"
40 #include "llvm/IR/Constants.h"
41 #include "llvm/IR/DiagnosticInfo.h"
42 #include "llvm/IR/Dominators.h"
43 #include "llvm/IR/Function.h"
44 #include "llvm/IR/Instruction.h"
45 #include "llvm/IR/Instructions.h"
46 #include "llvm/IR/IntrinsicInst.h"
47 #include "llvm/IR/Metadata.h"
48 #include "llvm/IR/PassManager.h"
49 #include "llvm/InitializePasses.h"
50 #include "llvm/Pass.h"
51 #include "llvm/Support/Casting.h"
52 #include "llvm/Support/CommandLine.h"
53 #include "llvm/Support/Debug.h"
54 #include "llvm/Support/ErrorHandling.h"
55 #include "llvm/Support/raw_ostream.h"
56 #include "llvm/Transforms/Scalar.h"
57 #include "llvm/Transforms/Scalar/LoopPassManager.h"
58 #include "llvm/Transforms/Utils.h"
59 #include "llvm/Transforms/Utils/LoopSimplify.h"
60 #include "llvm/Transforms/Utils/LoopUtils.h"
61 #include "llvm/Transforms/Utils/SizeOpts.h"
62 #include "llvm/Transforms/Utils/UnrollLoop.h"
63 #include <algorithm>
64 #include <cassert>
65 #include <cstdint>
66 #include <limits>
67 #include <string>
68 #include <tuple>
69 #include <utility>
70 
71 using namespace llvm;
72 
73 #define DEBUG_TYPE "loop-unroll"
74 
75 cl::opt<bool> llvm::ForgetSCEVInLoopUnroll(
76     "forget-scev-loop-unroll", cl::init(false), cl::Hidden,
77     cl::desc("Forget everything in SCEV when doing LoopUnroll, instead of just"
78              " the current top-most loop. This is somtimes preferred to reduce"
79              " compile time."));
80 
81 static cl::opt<unsigned>
82     UnrollThreshold("unroll-threshold", cl::Hidden,
83                     cl::desc("The cost threshold for loop unrolling"));
84 
85 static cl::opt<unsigned> UnrollPartialThreshold(
86     "unroll-partial-threshold", cl::Hidden,
87     cl::desc("The cost threshold for partial loop unrolling"));
88 
89 static cl::opt<unsigned> UnrollMaxPercentThresholdBoost(
90     "unroll-max-percent-threshold-boost", cl::init(400), cl::Hidden,
91     cl::desc("The maximum 'boost' (represented as a percentage >= 100) applied "
92              "to the threshold when aggressively unrolling a loop due to the "
93              "dynamic cost savings. If completely unrolling a loop will reduce "
94              "the total runtime from X to Y, we boost the loop unroll "
95              "threshold to DefaultThreshold*std::min(MaxPercentThresholdBoost, "
96              "X/Y). This limit avoids excessive code bloat."));
97 
98 static cl::opt<unsigned> UnrollMaxIterationsCountToAnalyze(
99     "unroll-max-iteration-count-to-analyze", cl::init(10), cl::Hidden,
100     cl::desc("Don't allow loop unrolling to simulate more than this number of"
101              "iterations when checking full unroll profitability"));
102 
103 static cl::opt<unsigned> UnrollCount(
104     "unroll-count", cl::Hidden,
105     cl::desc("Use this unroll count for all loops including those with "
106              "unroll_count pragma values, for testing purposes"));
107 
108 static cl::opt<unsigned> UnrollMaxCount(
109     "unroll-max-count", cl::Hidden,
110     cl::desc("Set the max unroll count for partial and runtime unrolling, for"
111              "testing purposes"));
112 
113 static cl::opt<unsigned> UnrollFullMaxCount(
114     "unroll-full-max-count", cl::Hidden,
115     cl::desc(
116         "Set the max unroll count for full unrolling, for testing purposes"));
117 
118 static cl::opt<unsigned> UnrollPeelCount(
119     "unroll-peel-count", cl::Hidden,
120     cl::desc("Set the unroll peeling count, for testing purposes"));
121 
122 static cl::opt<bool>
123     UnrollAllowPartial("unroll-allow-partial", cl::Hidden,
124                        cl::desc("Allows loops to be partially unrolled until "
125                                 "-unroll-threshold loop size is reached."));
126 
127 static cl::opt<bool> UnrollAllowRemainder(
128     "unroll-allow-remainder", cl::Hidden,
129     cl::desc("Allow generation of a loop remainder (extra iterations) "
130              "when unrolling a loop."));
131 
132 static cl::opt<bool>
133     UnrollRuntime("unroll-runtime", cl::ZeroOrMore, cl::Hidden,
134                   cl::desc("Unroll loops with run-time trip counts"));
135 
136 static cl::opt<unsigned> UnrollMaxUpperBound(
137     "unroll-max-upperbound", cl::init(8), cl::Hidden,
138     cl::desc(
139         "The max of trip count upper bound that is considered in unrolling"));
140 
141 static cl::opt<unsigned> PragmaUnrollThreshold(
142     "pragma-unroll-threshold", cl::init(16 * 1024), cl::Hidden,
143     cl::desc("Unrolled size limit for loops with an unroll(full) or "
144              "unroll_count pragma."));
145 
146 static cl::opt<unsigned> FlatLoopTripCountThreshold(
147     "flat-loop-tripcount-threshold", cl::init(5), cl::Hidden,
148     cl::desc("If the runtime tripcount for the loop is lower than the "
149              "threshold, the loop is considered as flat and will be less "
150              "aggressively unrolled."));
151 
152 static cl::opt<bool>
153     UnrollAllowPeeling("unroll-allow-peeling", cl::init(true), cl::Hidden,
154                        cl::desc("Allows loops to be peeled when the dynamic "
155                                 "trip count is known to be low."));
156 
157 static cl::opt<bool> UnrollUnrollRemainder(
158   "unroll-remainder", cl::Hidden,
159   cl::desc("Allow the loop remainder to be unrolled."));
160 
161 // This option isn't ever intended to be enabled, it serves to allow
162 // experiments to check the assumptions about when this kind of revisit is
163 // necessary.
164 static cl::opt<bool> UnrollRevisitChildLoops(
165     "unroll-revisit-child-loops", cl::Hidden,
166     cl::desc("Enqueue and re-visit child loops in the loop PM after unrolling. "
167              "This shouldn't typically be needed as child loops (or their "
168              "clones) were already visited."));
169 
170 static cl::opt<unsigned> UnrollThresholdAggressive(
171     "unroll-threshold-aggressive", cl::init(300), cl::Hidden,
172     cl::desc("Threshold (max size of unrolled loop) to use in aggressive (O3) "
173              "optimizations"));
174 static cl::opt<unsigned>
175     UnrollThresholdDefault("unroll-threshold-default", cl::init(150),
176                            cl::Hidden,
177                            cl::desc("Default threshold (max size of unrolled "
178                                     "loop), used in all but O3 optimizations"));
179 
180 /// A magic value for use with the Threshold parameter to indicate
181 /// that the loop unroll should be performed regardless of how much
182 /// code expansion would result.
183 static const unsigned NoThreshold = std::numeric_limits<unsigned>::max();
184 
185 /// Gather the various unrolling parameters based on the defaults, compiler
186 /// flags, TTI overrides and user specified parameters.
187 TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences(
188     Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI,
189     BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, int OptLevel,
190     Optional<unsigned> UserThreshold, Optional<unsigned> UserCount,
191     Optional<bool> UserAllowPartial, Optional<bool> UserRuntime,
192     Optional<bool> UserUpperBound, Optional<bool> UserAllowPeeling,
193     Optional<bool> UserAllowProfileBasedPeeling,
194     Optional<unsigned> UserFullUnrollMaxCount) {
195   TargetTransformInfo::UnrollingPreferences UP;
196 
197   // Set up the defaults
198   UP.Threshold =
199       OptLevel > 2 ? UnrollThresholdAggressive : UnrollThresholdDefault;
200   UP.MaxPercentThresholdBoost = 400;
201   UP.OptSizeThreshold = 0;
202   UP.PartialThreshold = 150;
203   UP.PartialOptSizeThreshold = 0;
204   UP.Count = 0;
205   UP.PeelCount = 0;
206   UP.DefaultUnrollRuntimeCount = 8;
207   UP.MaxCount = std::numeric_limits<unsigned>::max();
208   UP.FullUnrollMaxCount = std::numeric_limits<unsigned>::max();
209   UP.BEInsns = 2;
210   UP.Partial = false;
211   UP.Runtime = false;
212   UP.AllowRemainder = true;
213   UP.UnrollRemainder = false;
214   UP.AllowExpensiveTripCount = false;
215   UP.Force = false;
216   UP.UpperBound = false;
217   UP.AllowPeeling = true;
218   UP.UnrollAndJam = false;
219   UP.PeelProfiledIterations = true;
220   UP.UnrollAndJamInnerLoopThreshold = 60;
221 
222   // Override with any target specific settings
223   TTI.getUnrollingPreferences(L, SE, UP);
224 
225   // Apply size attributes
226   bool OptForSize = L->getHeader()->getParent()->hasOptSize() ||
227                     llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
228                                                 PGSOQueryType::IRPass);
229   if (OptForSize) {
230     UP.Threshold = UP.OptSizeThreshold;
231     UP.PartialThreshold = UP.PartialOptSizeThreshold;
232     UP.MaxPercentThresholdBoost = 100;
233   }
234 
235   // Apply any user values specified by cl::opt
236   if (UnrollThreshold.getNumOccurrences() > 0)
237     UP.Threshold = UnrollThreshold;
238   if (UnrollPartialThreshold.getNumOccurrences() > 0)
239     UP.PartialThreshold = UnrollPartialThreshold;
240   if (UnrollMaxPercentThresholdBoost.getNumOccurrences() > 0)
241     UP.MaxPercentThresholdBoost = UnrollMaxPercentThresholdBoost;
242   if (UnrollMaxCount.getNumOccurrences() > 0)
243     UP.MaxCount = UnrollMaxCount;
244   if (UnrollFullMaxCount.getNumOccurrences() > 0)
245     UP.FullUnrollMaxCount = UnrollFullMaxCount;
246   if (UnrollPeelCount.getNumOccurrences() > 0)
247     UP.PeelCount = UnrollPeelCount;
248   if (UnrollAllowPartial.getNumOccurrences() > 0)
249     UP.Partial = UnrollAllowPartial;
250   if (UnrollAllowRemainder.getNumOccurrences() > 0)
251     UP.AllowRemainder = UnrollAllowRemainder;
252   if (UnrollRuntime.getNumOccurrences() > 0)
253     UP.Runtime = UnrollRuntime;
254   if (UnrollMaxUpperBound == 0)
255     UP.UpperBound = false;
256   if (UnrollAllowPeeling.getNumOccurrences() > 0)
257     UP.AllowPeeling = UnrollAllowPeeling;
258   if (UnrollUnrollRemainder.getNumOccurrences() > 0)
259     UP.UnrollRemainder = UnrollUnrollRemainder;
260 
261   // Apply user values provided by argument
262   if (UserThreshold.hasValue()) {
263     UP.Threshold = *UserThreshold;
264     UP.PartialThreshold = *UserThreshold;
265   }
266   if (UserCount.hasValue())
267     UP.Count = *UserCount;
268   if (UserAllowPartial.hasValue())
269     UP.Partial = *UserAllowPartial;
270   if (UserRuntime.hasValue())
271     UP.Runtime = *UserRuntime;
272   if (UserUpperBound.hasValue())
273     UP.UpperBound = *UserUpperBound;
274   if (UserAllowPeeling.hasValue())
275     UP.AllowPeeling = *UserAllowPeeling;
276   if (UserAllowProfileBasedPeeling.hasValue())
277     UP.PeelProfiledIterations = *UserAllowProfileBasedPeeling;
278   if (UserFullUnrollMaxCount.hasValue())
279     UP.FullUnrollMaxCount = *UserFullUnrollMaxCount;
280 
281   return UP;
282 }
283 
namespace {

/// A struct to densely store the state of an instruction after unrolling at
/// each iteration.
///
/// This is designed to work like a tuple of <Instruction *, int> for the
/// purposes of hashing and lookup, but to be able to associate two boolean
/// states with each key.
struct UnrolledInstState {
  Instruction *I;
  int Iteration : 30;     // Which simulated iteration this state belongs to.
  unsigned IsFree : 1;    // Expected to fold away entirely after unrolling.
  unsigned IsCounted : 1; // Cost already added to the running unrolled total.
};

/// Hashing and equality testing for a set of the instruction states.
///
/// Only the (I, Iteration) pair participates in hashing and equality; the
/// IsFree/IsCounted flag bits are payload, so the sentinel keys below can
/// leave them zeroed.
struct UnrolledInstStateKeyInfo {
  using PtrInfo = DenseMapInfo<Instruction *>;
  using PairInfo = DenseMapInfo<std::pair<Instruction *, int>>;

  static inline UnrolledInstState getEmptyKey() {
    return {PtrInfo::getEmptyKey(), 0, 0, 0};
  }

  static inline UnrolledInstState getTombstoneKey() {
    return {PtrInfo::getTombstoneKey(), 0, 0, 0};
  }

  static inline unsigned getHashValue(const UnrolledInstState &S) {
    return PairInfo::getHashValue({S.I, S.Iteration});
  }

  static inline bool isEqual(const UnrolledInstState &LHS,
                             const UnrolledInstState &RHS) {
    return PairInfo::isEqual({LHS.I, LHS.Iteration}, {RHS.I, RHS.Iteration});
  }
};

/// Result of the full-unroll profitability analysis below.
struct EstimatedUnrollCost {
  /// The estimated cost after unrolling.
  unsigned UnrolledCost;

  /// The estimated dynamic cost of executing the instructions in the
  /// rolled form.
  unsigned RolledDynamicCost;
};

} // end anonymous namespace
332 
/// Figure out if the loop is worth full unrolling.
///
/// Complete loop unrolling can make some loads constant, and we need to know
/// if that would expose any further optimization opportunities.  This routine
/// estimates this optimization.  It computes cost of unrolled loop
/// (UnrolledCost) and dynamic cost of the original loop (RolledDynamicCost). By
/// dynamic cost we mean that we won't count costs of blocks that are known not
/// to be executed (i.e. if we have a branch in the loop and we know that at the
/// given iteration its condition would be resolved to true, we won't add up the
/// cost of the 'false'-block).
/// \returns Optional value, holding the RolledDynamicCost and UnrolledCost. If
/// the analysis failed (no benefits expected from the unrolling, or the loop is
/// too big to analyze), the returned value is None.
static Optional<EstimatedUnrollCost> analyzeLoopUnrollCost(
    const Loop *L, unsigned TripCount, DominatorTree &DT, ScalarEvolution &SE,
    const SmallPtrSetImpl<const Value *> &EphValues,
    const TargetTransformInfo &TTI, unsigned MaxUnrolledLoopSize) {
  // We want to be able to scale offsets by the trip count and add more offsets
  // to them without checking for overflows, and we already don't want to
  // analyze *massive* trip counts, so we force the max to be reasonably small.
  assert(UnrollMaxIterationsCountToAnalyze <
             (unsigned)(std::numeric_limits<int>::max() / 2) &&
         "The unroll iterations max is too large!");

  // Only analyze inner loops. We can't properly estimate cost of nested loops
  // and we won't visit inner loops again anyway.
  if (!L->empty())
    return None;

  // Don't simulate loops with a big or unknown tripcount
  if (!UnrollMaxIterationsCountToAnalyze || !TripCount ||
      TripCount > UnrollMaxIterationsCountToAnalyze)
    return None;

  SmallSetVector<BasicBlock *, 16> BBWorklist;
  SmallSetVector<std::pair<BasicBlock *, BasicBlock *>, 4> ExitWorklist;
  DenseMap<Value *, Constant *> SimplifiedValues;
  SmallVector<std::pair<Value *, Constant *>, 4> SimplifiedInputValues;

  // The estimated cost of the unrolled form of the loop. We try to estimate
  // this by simplifying as much as we can while computing the estimate.
  unsigned UnrolledCost = 0;

  // We also track the estimated dynamic (that is, actually executed) cost in
  // the rolled form. This helps identify cases when the savings from unrolling
  // aren't just exposing dead control flows, but actual reduced dynamic
  // instructions due to the simplifications which we expect to occur after
  // unrolling.
  unsigned RolledDynamicCost = 0;

  // We track the simplification of each instruction in each iteration. We use
  // this to recursively merge costs into the unrolled cost on-demand so that
  // we don't count the cost of any dead code. This is essentially a map from
  // <instruction, int> to <bool, bool>, but stored as a densely packed struct.
  DenseSet<UnrolledInstState, UnrolledInstStateKeyInfo> InstCostMap;

  // A small worklist used to accumulate cost of instructions from each
  // observable and reached root in the loop.
  SmallVector<Instruction *, 16> CostWorklist;

  // PHI-used worklist used between iterations while accumulating cost.
  SmallVector<Instruction *, 4> PHIUsedList;

  // Helper function to accumulate cost for instructions in the loop.
  auto AddCostRecursively = [&](Instruction &RootI, int Iteration) {
    assert(Iteration >= 0 && "Cannot have a negative iteration!");
    assert(CostWorklist.empty() && "Must start with an empty cost list");
    assert(PHIUsedList.empty() && "Must start with an empty phi used list");
    CostWorklist.push_back(&RootI);
    // Walk iterations backwards: a value reaching a header PHI was produced
    // by the previous iteration, so each pass over PHIUsedList steps
    // Iteration down by one.
    for (;; --Iteration) {
      do {
        Instruction *I = CostWorklist.pop_back_val();

        // InstCostMap only uses I and Iteration as a key, the other two values
        // don't matter here.
        auto CostIter = InstCostMap.find({I, Iteration, 0, 0});
        if (CostIter == InstCostMap.end())
          // If an input to a PHI node comes from a dead path through the loop
          // we may have no cost data for it here. What that actually means is
          // that it is free.
          continue;
        auto &Cost = *CostIter;
        if (Cost.IsCounted)
          // Already counted this instruction.
          continue;

        // Mark that we are counting the cost of this instruction now.
        Cost.IsCounted = true;

        // If this is a PHI node in the loop header, just add it to the PHI set.
        if (auto *PhiI = dyn_cast<PHINode>(I))
          if (PhiI->getParent() == L->getHeader()) {
            assert(Cost.IsFree && "Loop PHIs shouldn't be evaluated as they "
                                  "inherently simplify during unrolling.");
            if (Iteration == 0)
              continue;

            // Push the incoming value from the backedge into the PHI used list
            // if it is an in-loop instruction. We'll use this to populate the
            // cost worklist for the next iteration (as we count backwards).
            if (auto *OpI = dyn_cast<Instruction>(
                    PhiI->getIncomingValueForBlock(L->getLoopLatch())))
              if (L->contains(OpI))
                PHIUsedList.push_back(OpI);
            continue;
          }

        // First accumulate the cost of this instruction.
        if (!Cost.IsFree) {
          UnrolledCost += TTI.getUserCost(I);
          LLVM_DEBUG(dbgs() << "Adding cost of instruction (iteration "
                            << Iteration << "): ");
          LLVM_DEBUG(I->dump());
        }

        // We must count the cost of every operand which is not free,
        // recursively. If we reach a loop PHI node, simply add it to the set
        // to be considered on the next iteration (backwards!).
        for (Value *Op : I->operands()) {
          // Check whether this operand is free due to being a constant or
          // outside the loop.
          auto *OpI = dyn_cast<Instruction>(Op);
          if (!OpI || !L->contains(OpI))
            continue;

          // Otherwise accumulate its cost.
          CostWorklist.push_back(OpI);
        }
      } while (!CostWorklist.empty());

      if (PHIUsedList.empty())
        // We've exhausted the search.
        break;

      assert(Iteration > 0 &&
             "Cannot track PHI-used values past the first iteration!");
      CostWorklist.append(PHIUsedList.begin(), PHIUsedList.end());
      PHIUsedList.clear();
    }
  };

  // Ensure that we don't violate the loop structure invariants relied on by
  // this analysis.
  assert(L->isLoopSimplifyForm() && "Must put loop into normal form first.");
  assert(L->isLCSSAForm(DT) &&
         "Must have loops in LCSSA form to track live-out values.");

  LLVM_DEBUG(dbgs() << "Starting LoopUnroll profitability analysis...\n");

  // Simulate execution of each iteration of the loop counting instructions,
  // which would be simplified.
  // Since the same load will take different values on different iterations,
  // we literally have to go through all loop's iterations.
  for (unsigned Iteration = 0; Iteration < TripCount; ++Iteration) {
    LLVM_DEBUG(dbgs() << " Analyzing iteration " << Iteration << "\n");

    // Prepare for the iteration by collecting any simplified entry or backedge
    // inputs.
    for (Instruction &I : *L->getHeader()) {
      auto *PHI = dyn_cast<PHINode>(&I);
      if (!PHI)
        break;

      // The loop header PHI nodes must have exactly two inputs: one from the
      // loop preheader and one from the loop latch.
      assert(
          PHI->getNumIncomingValues() == 2 &&
          "Must have an incoming value only for the preheader and the latch.");

      Value *V = PHI->getIncomingValueForBlock(
          Iteration == 0 ? L->getLoopPreheader() : L->getLoopLatch());
      Constant *C = dyn_cast<Constant>(V);
      if (Iteration != 0 && !C)
        C = SimplifiedValues.lookup(V);
      if (C)
        SimplifiedInputValues.push_back({PHI, C});
    }

    // Now clear and re-populate the map for the next iteration.
    SimplifiedValues.clear();
    while (!SimplifiedInputValues.empty())
      SimplifiedValues.insert(SimplifiedInputValues.pop_back_val());

    UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, SE, L);

    BBWorklist.clear();
    BBWorklist.insert(L->getHeader());
    // Note that we *must not* cache the size, this loop grows the worklist.
    for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
      BasicBlock *BB = BBWorklist[Idx];

      // Visit all instructions in the given basic block and try to simplify
      // it.  We don't change the actual IR, just count optimization
      // opportunities.
      for (Instruction &I : *BB) {
        // These won't get into the final code - don't even try calculating the
        // cost for them.
        if (isa<DbgInfoIntrinsic>(I) || EphValues.count(&I))
          continue;

        // Track this instruction's expected baseline cost when executing the
        // rolled loop form.
        RolledDynamicCost += TTI.getUserCost(&I);

        // Visit the instruction to analyze its loop cost after unrolling,
        // and if the visitor returns true, mark the instruction as free after
        // unrolling and continue.
        bool IsFree = Analyzer.visit(I);
        bool Inserted = InstCostMap.insert({&I, (int)Iteration,
                                           (unsigned)IsFree,
                                           /*IsCounted*/ false}).second;
        (void)Inserted;
        assert(Inserted && "Cannot have a state for an unvisited instruction!");

        if (IsFree)
          continue;

        // Can't properly model a cost of a call.
        // FIXME: With a proper cost model we should be able to do it.
        if (auto *CI = dyn_cast<CallInst>(&I)) {
          const Function *Callee = CI->getCalledFunction();
          if (!Callee || TTI.isLoweredToCall(Callee)) {
            LLVM_DEBUG(dbgs() << "Can't analyze cost of loop with call\n");
            return None;
          }
        }

        // If the instruction might have a side-effect recursively account for
        // the cost of it and all the instructions leading up to it.
        if (I.mayHaveSideEffects())
          AddCostRecursively(I, Iteration);

        // If unrolled body turns out to be too big, bail out.
        if (UnrolledCost > MaxUnrolledLoopSize) {
          LLVM_DEBUG(dbgs() << "  Exceeded threshold.. exiting.\n"
                            << "  UnrolledCost: " << UnrolledCost
                            << ", MaxUnrolledLoopSize: " << MaxUnrolledLoopSize
                            << "\n");
          return None;
        }
      }

      Instruction *TI = BB->getTerminator();

      // Add in the live successors by first checking whether we have terminator
      // that may be simplified based on the values simplified by this call.
      BasicBlock *KnownSucc = nullptr;
      if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
        if (BI->isConditional()) {
          if (Constant *SimpleCond =
                  SimplifiedValues.lookup(BI->getCondition())) {
            // Just take the first successor if condition is undef
            if (isa<UndefValue>(SimpleCond))
              KnownSucc = BI->getSuccessor(0);
            else if (ConstantInt *SimpleCondVal =
                         dyn_cast<ConstantInt>(SimpleCond))
              KnownSucc = BI->getSuccessor(SimpleCondVal->isZero() ? 1 : 0);
          }
        }
      } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
        if (Constant *SimpleCond =
                SimplifiedValues.lookup(SI->getCondition())) {
          // Just take the first successor if condition is undef
          if (isa<UndefValue>(SimpleCond))
            KnownSucc = SI->getSuccessor(0);
          else if (ConstantInt *SimpleCondVal =
                       dyn_cast<ConstantInt>(SimpleCond))
            KnownSucc = SI->findCaseValue(SimpleCondVal)->getCaseSuccessor();
        }
      }
      // When the terminator folds to a single live successor, only that
      // successor contributes to the unrolled cost; dead paths are skipped.
      if (KnownSucc) {
        if (L->contains(KnownSucc))
          BBWorklist.insert(KnownSucc);
        else
          ExitWorklist.insert({BB, KnownSucc});
        continue;
      }

      // Add BB's successors to the worklist.
      for (BasicBlock *Succ : successors(BB))
        if (L->contains(Succ))
          BBWorklist.insert(Succ);
        else
          ExitWorklist.insert({BB, Succ});
      AddCostRecursively(*TI, Iteration);
    }

    // If we found no optimization opportunities on the first iteration, we
    // won't find them on later ones too.
    if (UnrolledCost == RolledDynamicCost) {
      LLVM_DEBUG(dbgs() << "  No opportunities found.. exiting.\n"
                        << "  UnrolledCost: " << UnrolledCost << "\n");
      return None;
    }
  }

  // Account for values live out of the loop: instructions feeding exit-block
  // PHIs are materialized by the last (TripCount - 1) simulated iteration.
  while (!ExitWorklist.empty()) {
    BasicBlock *ExitingBB, *ExitBB;
    std::tie(ExitingBB, ExitBB) = ExitWorklist.pop_back_val();

    for (Instruction &I : *ExitBB) {
      auto *PN = dyn_cast<PHINode>(&I);
      if (!PN)
        break;

      Value *Op = PN->getIncomingValueForBlock(ExitingBB);
      if (auto *OpI = dyn_cast<Instruction>(Op))
        if (L->contains(OpI))
          AddCostRecursively(*OpI, TripCount - 1);
    }
  }

  LLVM_DEBUG(dbgs() << "Analysis finished:\n"
                    << "UnrolledCost: " << UnrolledCost << ", "
                    << "RolledDynamicCost: " << RolledDynamicCost << "\n");
  return {{UnrolledCost, RolledDynamicCost}};
}
650 
651 /// ApproximateLoopSize - Approximate the size of the loop.
652 unsigned llvm::ApproximateLoopSize(
653     const Loop *L, unsigned &NumCalls, bool &NotDuplicatable, bool &Convergent,
654     const TargetTransformInfo &TTI,
655     const SmallPtrSetImpl<const Value *> &EphValues, unsigned BEInsns) {
656   CodeMetrics Metrics;
657   for (BasicBlock *BB : L->blocks())
658     Metrics.analyzeBasicBlock(BB, TTI, EphValues);
659   NumCalls = Metrics.NumInlineCandidates;
660   NotDuplicatable = Metrics.notDuplicatable;
661   Convergent = Metrics.convergent;
662 
663   unsigned LoopSize = Metrics.NumInsts;
664 
665   // Don't allow an estimate of size zero.  This would allows unrolling of loops
666   // with huge iteration counts, which is a compile time problem even if it's
667   // not a problem for code quality. Also, the code using this size may assume
668   // that each loop has at least three instructions (likely a conditional
669   // branch, a comparison feeding that branch, and some kind of loop increment
670   // feeding that comparison instruction).
671   LoopSize = std::max(LoopSize, BEInsns + 1);
672 
673   return LoopSize;
674 }
675 
676 // Returns the loop hint metadata node with the given name (for example,
677 // "llvm.loop.unroll.count").  If no such metadata node exists, then nullptr is
678 // returned.
679 static MDNode *getUnrollMetadataForLoop(const Loop *L, StringRef Name) {
680   if (MDNode *LoopID = L->getLoopID())
681     return GetUnrollMetadata(LoopID, Name);
682   return nullptr;
683 }
684 
685 // Returns true if the loop has an unroll(full) pragma.
686 static bool hasUnrollFullPragma(const Loop *L) {
687   return getUnrollMetadataForLoop(L, "llvm.loop.unroll.full");
688 }
689 
690 // Returns true if the loop has an unroll(enable) pragma. This metadata is used
691 // for both "#pragma unroll" and "#pragma clang loop unroll(enable)" directives.
692 static bool hasUnrollEnablePragma(const Loop *L) {
693   return getUnrollMetadataForLoop(L, "llvm.loop.unroll.enable");
694 }
695 
696 // Returns true if the loop has an runtime unroll(disable) pragma.
697 static bool hasRuntimeUnrollDisablePragma(const Loop *L) {
698   return getUnrollMetadataForLoop(L, "llvm.loop.unroll.runtime.disable");
699 }
700 
701 // If loop has an unroll_count pragma return the (necessarily
702 // positive) value from the pragma.  Otherwise return 0.
703 static unsigned unrollCountPragmaValue(const Loop *L) {
704   MDNode *MD = getUnrollMetadataForLoop(L, "llvm.loop.unroll.count");
705   if (MD) {
706     assert(MD->getNumOperands() == 2 &&
707            "Unroll count hint metadata should have two operands.");
708     unsigned Count =
709         mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
710     assert(Count >= 1 && "Unroll count must be positive.");
711     return Count;
712   }
713   return 0;
714 }
715 
716 // Computes the boosting factor for complete unrolling.
717 // If fully unrolling the loop would save a lot of RolledDynamicCost, it would
718 // be beneficial to fully unroll the loop even if unrolledcost is large. We
719 // use (RolledDynamicCost / UnrolledCost) to model the unroll benefits to adjust
720 // the unroll threshold.
721 static unsigned getFullUnrollBoostingFactor(const EstimatedUnrollCost &Cost,
722                                             unsigned MaxPercentThresholdBoost) {
723   if (Cost.RolledDynamicCost >= std::numeric_limits<unsigned>::max() / 100)
724     return 100;
725   else if (Cost.UnrolledCost != 0)
726     // The boosting factor is RolledDynamicCost / UnrolledCost
727     return std::min(100 * Cost.RolledDynamicCost / Cost.UnrolledCost,
728                     MaxPercentThresholdBoost);
729   else
730     return MaxPercentThresholdBoost;
731 }
732 
733 // Returns loop size estimation for unrolled loop.
734 static uint64_t getUnrolledLoopSize(
735     unsigned LoopSize,
736     TargetTransformInfo::UnrollingPreferences &UP) {
737   assert(LoopSize >= UP.BEInsns && "LoopSize should not be less than BEInsns!");
738   return (uint64_t)(LoopSize - UP.BEInsns) * UP.Count + UP.BEInsns;
739 }
740 
// Returns true if unroll count was set explicitly (via pragma or a
// user-supplied option).
// Calculates unroll count and writes it to UP.Count; UP.Count == 0 on return
// means the loop should not be unrolled. TripCount, TripMultiple, and
// UseUpperBound may be updated when full unrolling by an exact or upper-bound
// trip count is selected.
// Will also use metadata and command-line options that are specific to the
// LoopUnroll pass (which, for instance, are irrelevant for the
// LoopUnrollAndJam pass).
// FIXME: This function is used by LoopUnroll and LoopUnrollAndJam, but consumes
// many LoopUnroll-specific options. The shared functionality should be
// refactored into its own function.
bool llvm::computeUnrollCount(
    Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI,
    ScalarEvolution &SE, const SmallPtrSetImpl<const Value *> &EphValues,
    OptimizationRemarkEmitter *ORE, unsigned &TripCount, unsigned MaxTripCount,
    bool MaxOrZero, unsigned &TripMultiple, unsigned LoopSize,
    TargetTransformInfo::UnrollingPreferences &UP, bool &UseUpperBound) {

  // Check for explicit Count.
  // 1st priority is unroll count set by "unroll-count" option.
  bool UserUnrollCount = UnrollCount.getNumOccurrences() > 0;
  if (UserUnrollCount) {
    UP.Count = UnrollCount;
    UP.AllowExpensiveTripCount = true;
    UP.Force = true;
    if (UP.AllowRemainder && getUnrolledLoopSize(LoopSize, UP) < UP.Threshold)
      return true;
  }

  // 2nd priority is unroll count set by pragma.
  unsigned PragmaCount = unrollCountPragmaValue(L);
  if (PragmaCount > 0) {
    UP.Count = PragmaCount;
    UP.Runtime = true;
    UP.AllowExpensiveTripCount = true;
    UP.Force = true;
    // Honor the pragma count if either a remainder loop is allowed or the
    // count evenly divides the known trip multiple (no remainder needed).
    if ((UP.AllowRemainder || (TripMultiple % PragmaCount == 0)) &&
        getUnrolledLoopSize(LoopSize, UP) < PragmaUnrollThreshold)
      return true;
  }
  bool PragmaFullUnroll = hasUnrollFullPragma(L);
  if (PragmaFullUnroll && TripCount != 0) {
    UP.Count = TripCount;
    if (getUnrolledLoopSize(LoopSize, UP) < PragmaUnrollThreshold)
      return false;
  }

  bool PragmaEnableUnroll = hasUnrollEnablePragma(L);
  bool ExplicitUnroll = PragmaCount > 0 || PragmaFullUnroll ||
                        PragmaEnableUnroll || UserUnrollCount;

  if (ExplicitUnroll && TripCount != 0) {
    // If the loop has an unrolling pragma, we want to be more aggressive with
    // unrolling limits. Set thresholds to at least the PragmaUnrollThreshold
    // value which is larger than the default limits.
    UP.Threshold = std::max<unsigned>(UP.Threshold, PragmaUnrollThreshold);
    UP.PartialThreshold =
        std::max<unsigned>(UP.PartialThreshold, PragmaUnrollThreshold);
  }

  // 3rd priority is full unroll count.
  // Full unroll makes sense only when TripCount or its upper bound could be
  // statically calculated.
  // Also we need to check if we exceed FullUnrollMaxCount.
  // If using the upper bound to unroll, TripMultiple should be set to 1 because
  // we do not know when loop may exit.

  // We can unroll by the upper bound amount if it's generally allowed or if
  // we know that the loop is executed either the upper bound or zero times.
  // (MaxOrZero unrolling keeps only the first loop test, so the number of
  // loop tests remains the same compared to the non-unrolled version, whereas
  // the generic upper bound unrolling keeps all but the last loop test so the
  // number of loop tests goes up which may end up being worse on targets with
  // constrained branch predictor resources so is controlled by an option.)
  // In addition we only unroll small upper bounds.
  unsigned FullUnrollMaxTripCount = MaxTripCount;
  if (!(UP.UpperBound || MaxOrZero) ||
      FullUnrollMaxTripCount > UnrollMaxUpperBound)
    FullUnrollMaxTripCount = 0;

  // UnrollByMaxCount and ExactTripCount cannot both be non zero since we only
  // compute the former when the latter is zero.
  unsigned ExactTripCount = TripCount;
  assert((ExactTripCount == 0 || FullUnrollMaxTripCount == 0) &&
         "ExtractTripCount and UnrollByMaxCount cannot both be non zero.");

  unsigned FullUnrollTripCount =
      ExactTripCount ? ExactTripCount : FullUnrollMaxTripCount;
  UP.Count = FullUnrollTripCount;
  if (FullUnrollTripCount && FullUnrollTripCount <= UP.FullUnrollMaxCount) {
    // When computing the unrolled size, note that BEInsns are not replicated
    // like the rest of the loop body.
    if (getUnrolledLoopSize(LoopSize, UP) < UP.Threshold) {
      UseUpperBound = (FullUnrollMaxTripCount == FullUnrollTripCount);
      TripCount = FullUnrollTripCount;
      TripMultiple = UP.UpperBound ? 1 : TripMultiple;
      return ExplicitUnroll;
    } else {
      // The loop isn't that small, but we still can fully unroll it if that
      // helps to remove a significant number of instructions.
      // To check that, run additional analysis on the loop.
      if (Optional<EstimatedUnrollCost> Cost = analyzeLoopUnrollCost(
              L, FullUnrollTripCount, DT, SE, EphValues, TTI,
              UP.Threshold * UP.MaxPercentThresholdBoost / 100)) {
        unsigned Boost =
            getFullUnrollBoostingFactor(*Cost, UP.MaxPercentThresholdBoost);
        if (Cost->UnrolledCost < UP.Threshold * Boost / 100) {
          UseUpperBound = (FullUnrollMaxTripCount == FullUnrollTripCount);
          TripCount = FullUnrollTripCount;
          TripMultiple = UP.UpperBound ? 1 : TripMultiple;
          return ExplicitUnroll;
        }
      }
    }
  }

  // 4th priority is loop peeling.
  computePeelCount(L, LoopSize, UP, TripCount, SE);
  if (UP.PeelCount) {
    // Peeling replaces unrolling here: keep a single loop body copy.
    UP.Runtime = false;
    UP.Count = 1;
    return ExplicitUnroll;
  }

  // 5th priority is partial unrolling.
  // Try partial unroll only when TripCount could be statically calculated.
  if (TripCount) {
    UP.Partial |= ExplicitUnroll;
    if (!UP.Partial) {
      LLVM_DEBUG(dbgs() << "  will not try to unroll partially because "
                        << "-unroll-allow-partial not given\n");
      UP.Count = 0;
      return false;
    }
    if (UP.Count == 0)
      UP.Count = TripCount;
    if (UP.PartialThreshold != NoThreshold) {
      // Reduce unroll count to be modulo of TripCount for partial unrolling.
      if (getUnrolledLoopSize(LoopSize, UP) > UP.PartialThreshold)
        UP.Count =
            (std::max(UP.PartialThreshold, UP.BEInsns + 1) - UP.BEInsns) /
            (LoopSize - UP.BEInsns);
      if (UP.Count > UP.MaxCount)
        UP.Count = UP.MaxCount;
      // Step the count down until it divides the trip count exactly.
      while (UP.Count != 0 && TripCount % UP.Count != 0)
        UP.Count--;
      if (UP.AllowRemainder && UP.Count <= 1) {
        // If there is no Count that is modulo of TripCount, set Count to
        // largest power-of-two factor that satisfies the threshold limit.
        // As we'll create fixup loop, do the type of unrolling only if
        // remainder loop is allowed.
        UP.Count = UP.DefaultUnrollRuntimeCount;
        while (UP.Count != 0 &&
               getUnrolledLoopSize(LoopSize, UP) > UP.PartialThreshold)
          UP.Count >>= 1;
      }
      if (UP.Count < 2) {
        if (PragmaEnableUnroll)
          ORE->emit([&]() {
            return OptimizationRemarkMissed(DEBUG_TYPE,
                                            "UnrollAsDirectedTooLarge",
                                            L->getStartLoc(), L->getHeader())
                   << "Unable to unroll loop as directed by unroll(enable) "
                      "pragma "
                      "because unrolled size is too large.";
          });
        UP.Count = 0;
      }
    } else {
      UP.Count = TripCount;
    }
    if (UP.Count > UP.MaxCount)
      UP.Count = UP.MaxCount;
    if ((PragmaFullUnroll || PragmaEnableUnroll) && TripCount &&
        UP.Count != TripCount)
      ORE->emit([&]() {
        return OptimizationRemarkMissed(DEBUG_TYPE,
                                        "FullUnrollAsDirectedTooLarge",
                                        L->getStartLoc(), L->getHeader())
               << "Unable to fully unroll loop as directed by unroll pragma "
                  "because "
                  "unrolled size is too large.";
      });
    LLVM_DEBUG(dbgs() << "  partially unrolling with count: " << UP.Count
                      << "\n");
    return ExplicitUnroll;
  }
  assert(TripCount == 0 &&
         "All cases when TripCount is constant should be covered here.");
  if (PragmaFullUnroll)
    ORE->emit([&]() {
      return OptimizationRemarkMissed(
                 DEBUG_TYPE, "CantFullUnrollAsDirectedRuntimeTripCount",
                 L->getStartLoc(), L->getHeader())
             << "Unable to fully unroll loop as directed by unroll(full) "
                "pragma "
                "because loop has a runtime trip count.";
    });

  // 6th priority is runtime unrolling.
  // Don't unroll a runtime trip count loop when it is disabled.
  if (hasRuntimeUnrollDisablePragma(L)) {
    UP.Count = 0;
    return false;
  }

  // Don't unroll a small upper bound loop unless user or TTI asked to do so.
  if (MaxTripCount && !UP.Force && MaxTripCount < UnrollMaxUpperBound) {
    UP.Count = 0;
    return false;
  }

  // Check if the runtime trip count is too small when profile is available.
  if (L->getHeader()->getParent()->hasProfileData()) {
    if (auto ProfileTripCount = getLoopEstimatedTripCount(L)) {
      if (*ProfileTripCount < FlatLoopTripCountThreshold)
        return false;
      else
        UP.AllowExpensiveTripCount = true;
    }
  }

  // Reduce count based on the type of unrolling and the threshold values.
  UP.Runtime |= PragmaEnableUnroll || PragmaCount > 0 || UserUnrollCount;
  if (!UP.Runtime) {
    LLVM_DEBUG(
        dbgs() << "  will not try to unroll loop with runtime trip count "
               << "-unroll-runtime not given\n");
    UP.Count = 0;
    return false;
  }
  if (UP.Count == 0)
    UP.Count = UP.DefaultUnrollRuntimeCount;

  // Reduce unroll count to be the largest power-of-two factor of
  // the original count which satisfies the threshold limit.
  while (UP.Count != 0 &&
         getUnrolledLoopSize(LoopSize, UP) > UP.PartialThreshold)
    UP.Count >>= 1;

  // Remember the count before further reductions so the debug output below
  // can report how much it was lowered (only needed in asserts builds).
#ifndef NDEBUG
  unsigned OrigCount = UP.Count;
#endif

  if (!UP.AllowRemainder && UP.Count != 0 && (TripMultiple % UP.Count) != 0) {
    while (UP.Count != 0 && TripMultiple % UP.Count != 0)
      UP.Count >>= 1;
    LLVM_DEBUG(
        dbgs() << "Remainder loop is restricted (that could architecture "
                  "specific or because the loop contains a convergent "
                  "instruction), so unroll count must divide the trip "
                  "multiple, "
               << TripMultiple << ".  Reducing unroll count from " << OrigCount
               << " to " << UP.Count << ".\n");

    using namespace ore;

    if (PragmaCount > 0 && !UP.AllowRemainder)
      ORE->emit([&]() {
        return OptimizationRemarkMissed(DEBUG_TYPE,
                                        "DifferentUnrollCountFromDirected",
                                        L->getStartLoc(), L->getHeader())
               << "Unable to unroll loop the number of times directed by "
                  "unroll_count pragma because remainder loop is restricted "
                  "(that could architecture specific or because the loop "
                  "contains a convergent instruction) and so must have an "
                  "unroll "
                  "count that divides the loop trip multiple of "
               << NV("TripMultiple", TripMultiple) << ".  Unrolling instead "
               << NV("UnrollCount", UP.Count) << " time(s).";
      });
  }

  if (UP.Count > UP.MaxCount)
    UP.Count = UP.MaxCount;

  if (MaxTripCount && UP.Count > MaxTripCount)
    UP.Count = MaxTripCount;

  LLVM_DEBUG(dbgs() << "  runtime unrolling with count: " << UP.Count
                    << "\n");
  // A count below 2 would not change the loop; signal "do not unroll".
  if (UP.Count < 2)
    UP.Count = 0;
  return ExplicitUnroll;
}
1023 
// Try to unroll (or peel) loop \p L. Gathers unrolling preferences, computes
// the trip count/multiple via SCEV, asks computeUnrollCount() for the unroll
// factor, performs the transformation, and reattaches follow-up loop metadata.
// Returns LoopUnrollResult::Unmodified when nothing was changed.
static LoopUnrollResult tryToUnrollLoop(
    Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
    const TargetTransformInfo &TTI, AssumptionCache &AC,
    OptimizationRemarkEmitter &ORE, BlockFrequencyInfo *BFI,
    ProfileSummaryInfo *PSI, bool PreserveLCSSA, int OptLevel,
    bool OnlyWhenForced, bool ForgetAllSCEV, Optional<unsigned> ProvidedCount,
    Optional<unsigned> ProvidedThreshold, Optional<bool> ProvidedAllowPartial,
    Optional<bool> ProvidedRuntime, Optional<bool> ProvidedUpperBound,
    Optional<bool> ProvidedAllowPeeling,
    Optional<bool> ProvidedAllowProfileBasedPeeling,
    Optional<unsigned> ProvidedFullUnrollMaxCount) {
  LLVM_DEBUG(dbgs() << "Loop Unroll: F["
                    << L->getHeader()->getParent()->getName() << "] Loop %"
                    << L->getHeader()->getName() << "\n");
  TransformationMode TM = hasUnrollTransformation(L);
  if (TM & TM_Disable)
    return LoopUnrollResult::Unmodified;
  if (!L->isLoopSimplifyForm()) {
    LLVM_DEBUG(
        dbgs() << "  Not unrolling loop which is not in loop-simplify form.\n");
    return LoopUnrollResult::Unmodified;
  }

  // When automatic unrolling is disabled, do not unroll unless overridden for
  // this loop.
  if (OnlyWhenForced && !(TM & TM_Enable))
    return LoopUnrollResult::Unmodified;

  bool OptForSize = L->getHeader()->getParent()->hasOptSize();
  unsigned NumInlineCandidates;
  bool NotDuplicatable;
  bool Convergent;
  TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences(
      L, SE, TTI, BFI, PSI, OptLevel, ProvidedThreshold, ProvidedCount,
      ProvidedAllowPartial, ProvidedRuntime, ProvidedUpperBound,
      ProvidedAllowPeeling, ProvidedAllowProfileBasedPeeling,
      ProvidedFullUnrollMaxCount);

  // Exit early if unrolling is disabled. For OptForSize, we pick the loop size
  // as threshold later on.
  if (UP.Threshold == 0 && (!UP.Partial || UP.PartialThreshold == 0) &&
      !OptForSize)
    return LoopUnrollResult::Unmodified;

  SmallPtrSet<const Value *, 32> EphValues;
  CodeMetrics::collectEphemeralValues(L, &AC, EphValues);

  unsigned LoopSize =
      ApproximateLoopSize(L, NumInlineCandidates, NotDuplicatable, Convergent,
                          TTI, EphValues, UP.BEInsns);
  LLVM_DEBUG(dbgs() << "  Loop Size = " << LoopSize << "\n");
  if (NotDuplicatable) {
    LLVM_DEBUG(dbgs() << "  Not unrolling loop which contains non-duplicatable"
                      << " instructions.\n");
    return LoopUnrollResult::Unmodified;
  }

  // When optimizing for size, use LoopSize + 1 as threshold (we use < Threshold
  // later), to (fully) unroll loops, if it does not increase code size.
  if (OptForSize)
    UP.Threshold = std::max(UP.Threshold, LoopSize + 1);

  if (NumInlineCandidates != 0) {
    LLVM_DEBUG(dbgs() << "  Not unrolling loop with inlinable calls.\n");
    return LoopUnrollResult::Unmodified;
  }

  // Find trip count and trip multiple if count is not available
  unsigned TripCount = 0;
  unsigned TripMultiple = 1;
  // If there are multiple exiting blocks but one of them is the latch, use the
  // latch for the trip count estimation. Otherwise insist on a single exiting
  // block for the trip count estimation.
  BasicBlock *ExitingBlock = L->getLoopLatch();
  if (!ExitingBlock || !L->isLoopExiting(ExitingBlock))
    ExitingBlock = L->getExitingBlock();
  if (ExitingBlock) {
    TripCount = SE.getSmallConstantTripCount(L, ExitingBlock);
    TripMultiple = SE.getSmallConstantTripMultiple(L, ExitingBlock);
  }

  // If the loop contains a convergent operation, the prelude we'd add
  // to do the first few instructions before we hit the unrolled loop
  // is unsafe -- it adds a control-flow dependency to the convergent
  // operation.  Therefore restrict remainder loop (try unrolling without).
  //
  // TODO: This is quite conservative.  In practice, convergent_op()
  // is likely to be called unconditionally in the loop.  In this
  // case, the program would be ill-formed (on most architectures)
  // unless n were the same on all threads in a thread group.
  // Assuming n is the same on all threads, any kind of unrolling is
  // safe.  But currently llvm's notion of convergence isn't powerful
  // enough to express this.
  if (Convergent)
    UP.AllowRemainder = false;

  // Try to find the trip count upper bound if we cannot find the exact trip
  // count.
  unsigned MaxTripCount = 0;
  bool MaxOrZero = false;
  if (!TripCount) {
    MaxTripCount = SE.getSmallConstantMaxTripCount(L);
    MaxOrZero = SE.isBackedgeTakenCountMaxOrZero(L);
  }

  // computeUnrollCount() decides whether it is beneficial to use upper bound to
  // fully unroll the loop.
  bool UseUpperBound = false;
  bool IsCountSetExplicitly = computeUnrollCount(
      L, TTI, DT, LI, SE, EphValues, &ORE, TripCount, MaxTripCount, MaxOrZero,
      TripMultiple, LoopSize, UP, UseUpperBound);
  if (!UP.Count)
    return LoopUnrollResult::Unmodified;
  // Unroll factor (Count) must be less or equal to TripCount.
  if (TripCount && UP.Count > TripCount)
    UP.Count = TripCount;

  // Save loop properties before it is transformed.
  MDNode *OrigLoopID = L->getLoopID();

  // Unroll the loop.
  Loop *RemainderLoop = nullptr;
  LoopUnrollResult UnrollResult = UnrollLoop(
      L,
      {UP.Count, TripCount, UP.Force, UP.Runtime, UP.AllowExpensiveTripCount,
       UseUpperBound, MaxOrZero, TripMultiple, UP.PeelCount, UP.UnrollRemainder,
       ForgetAllSCEV},
      LI, &SE, &DT, &AC, &ORE, PreserveLCSSA, &RemainderLoop);
  if (UnrollResult == LoopUnrollResult::Unmodified)
    return LoopUnrollResult::Unmodified;

  // Propagate the follow-up metadata from the original loop ID onto the
  // remainder loop, if one was created.
  if (RemainderLoop) {
    Optional<MDNode *> RemainderLoopID =
        makeFollowupLoopID(OrigLoopID, {LLVMLoopUnrollFollowupAll,
                                        LLVMLoopUnrollFollowupRemainder});
    if (RemainderLoopID.hasValue())
      RemainderLoop->setLoopID(RemainderLoopID.getValue());
  }

  if (UnrollResult != LoopUnrollResult::FullyUnrolled) {
    Optional<MDNode *> NewLoopID =
        makeFollowupLoopID(OrigLoopID, {LLVMLoopUnrollFollowupAll,
                                        LLVMLoopUnrollFollowupUnrolled});
    if (NewLoopID.hasValue()) {
      L->setLoopID(NewLoopID.getValue());

      // Do not setLoopAlreadyUnrolled if loop attributes have been specified
      // explicitly.
      return UnrollResult;
    }
  }

  // If loop has an unroll count pragma or unrolled by explicitly set count
  // mark loop as unrolled to prevent unrolling beyond that requested.
  // If the loop was peeled, we already "used up" the profile information
  // we had, so we don't want to unroll or peel again.
  if (UnrollResult != LoopUnrollResult::FullyUnrolled &&
      (IsCountSetExplicitly || (UP.PeelProfiledIterations && UP.PeelCount)))
    L->setLoopAlreadyUnrolled();

  return UnrollResult;
}
1186 
namespace {

/// Legacy pass manager wrapper around tryToUnrollLoop(). Stores the
/// command-line / constructor overrides as Optionals and forwards them,
/// together with the analyses it requires, to the unrolling driver.
class LoopUnroll : public LoopPass {
public:
  static char ID; // Pass ID, replacement for typeid

  int OptLevel;

  /// If false, use a cost model to determine whether unrolling of a loop is
  /// profitable. If true, only loops that explicitly request unrolling via
  /// metadata are considered. All other loops are skipped.
  bool OnlyWhenForced;

  /// If false, when SCEV is invalidated, only forget everything in the
  /// top-most loop (call forgetTopMostLoop), of the loop being processed.
  /// Otherwise, forgetAllLoops and rebuild when needed next.
  bool ForgetAllSCEV;

  // None means "not provided": the corresponding preference is left to
  // gatherUnrollingPreferences() / its defaults.
  Optional<unsigned> ProvidedCount;
  Optional<unsigned> ProvidedThreshold;
  Optional<bool> ProvidedAllowPartial;
  Optional<bool> ProvidedRuntime;
  Optional<bool> ProvidedUpperBound;
  Optional<bool> ProvidedAllowPeeling;
  Optional<bool> ProvidedAllowProfileBasedPeeling;
  Optional<unsigned> ProvidedFullUnrollMaxCount;

  LoopUnroll(int OptLevel = 2, bool OnlyWhenForced = false,
             bool ForgetAllSCEV = false, Optional<unsigned> Threshold = None,
             Optional<unsigned> Count = None,
             Optional<bool> AllowPartial = None, Optional<bool> Runtime = None,
             Optional<bool> UpperBound = None,
             Optional<bool> AllowPeeling = None,
             Optional<bool> AllowProfileBasedPeeling = None,
             Optional<unsigned> ProvidedFullUnrollMaxCount = None)
      : LoopPass(ID), OptLevel(OptLevel), OnlyWhenForced(OnlyWhenForced),
        ForgetAllSCEV(ForgetAllSCEV), ProvidedCount(std::move(Count)),
        ProvidedThreshold(Threshold), ProvidedAllowPartial(AllowPartial),
        ProvidedRuntime(Runtime), ProvidedUpperBound(UpperBound),
        ProvidedAllowPeeling(AllowPeeling),
        ProvidedAllowProfileBasedPeeling(AllowProfileBasedPeeling),
        ProvidedFullUnrollMaxCount(ProvidedFullUnrollMaxCount) {
    initializeLoopUnrollPass(*PassRegistry::getPassRegistry());
  }

  /// Fetch the required analyses and run tryToUnrollLoop() on \p L.
  /// Returns true if the IR was modified.
  bool runOnLoop(Loop *L, LPPassManager &LPM) override {
    if (skipLoop(L))
      return false;

    Function &F = *L->getHeader()->getParent();

    auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
    ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
    const TargetTransformInfo &TTI =
        getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
    auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
    // For the old PM, we can't use OptimizationRemarkEmitter as an analysis
    // pass.  Function analyses need to be preserved across loop transformations
    // but ORE cannot be preserved (see comment before the pass definition).
    OptimizationRemarkEmitter ORE(&F);
    bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);

    LoopUnrollResult Result = tryToUnrollLoop(
        L, DT, LI, SE, TTI, AC, ORE, nullptr, nullptr, PreserveLCSSA, OptLevel,
        OnlyWhenForced, ForgetAllSCEV, ProvidedCount, ProvidedThreshold,
        ProvidedAllowPartial, ProvidedRuntime, ProvidedUpperBound,
        ProvidedAllowPeeling, ProvidedAllowProfileBasedPeeling,
        ProvidedFullUnrollMaxCount);

    if (Result == LoopUnrollResult::FullyUnrolled)
      LPM.markLoopAsDeleted(*L);

    return Result != LoopUnrollResult::Unmodified;
  }

  /// This transformation requires natural loop information & requires that
  /// loop preheaders be inserted into the CFG...
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
    // FIXME: Loop passes are required to preserve domtree, and for now we just
    // recreate dom info if anything gets unrolled.
    getLoopAnalysisUsage(AU);
  }
};

} // end anonymous namespace
1275 
// Pass identification for the legacy pass manager.
char LoopUnroll::ID = 0;

// Register the pass and its analysis dependencies with the legacy PM.
INITIALIZE_PASS_BEGIN(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(LoopPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
1283 
1284 Pass *llvm::createLoopUnrollPass(int OptLevel, bool OnlyWhenForced,
1285                                  bool ForgetAllSCEV, int Threshold, int Count,
1286                                  int AllowPartial, int Runtime, int UpperBound,
1287                                  int AllowPeeling) {
1288   // TODO: It would make more sense for this function to take the optionals
1289   // directly, but that's dangerous since it would silently break out of tree
1290   // callers.
1291   return new LoopUnroll(
1292       OptLevel, OnlyWhenForced, ForgetAllSCEV,
1293       Threshold == -1 ? None : Optional<unsigned>(Threshold),
1294       Count == -1 ? None : Optional<unsigned>(Count),
1295       AllowPartial == -1 ? None : Optional<bool>(AllowPartial),
1296       Runtime == -1 ? None : Optional<bool>(Runtime),
1297       UpperBound == -1 ? None : Optional<bool>(UpperBound),
1298       AllowPeeling == -1 ? None : Optional<bool>(AllowPeeling));
1299 }
1300 
1301 Pass *llvm::createSimpleLoopUnrollPass(int OptLevel, bool OnlyWhenForced,
1302                                        bool ForgetAllSCEV) {
1303   return createLoopUnrollPass(OptLevel, OnlyWhenForced, ForgetAllSCEV, -1, -1,
1304                               0, 0, 0, 0);
1305 }
1306 
// New pass manager entry point that performs only full unrolling: partial,
// runtime, upper-bound, and peeling unrolling are all passed as disabled to
// tryToUnrollLoop(). Afterwards it updates the LPM state for loops created
// or destroyed by the transformation.
PreservedAnalyses LoopFullUnrollPass::run(Loop &L, LoopAnalysisManager &AM,
                                          LoopStandardAnalysisResults &AR,
                                          LPMUpdater &Updater) {
  // For the new PM, we can't use OptimizationRemarkEmitter as an analysis
  // pass. Function analyses need to be preserved across loop transformations
  // but ORE cannot be preserved (see comment before the pass definition).
  OptimizationRemarkEmitter ORE(L.getHeader()->getParent());

  // Keep track of the previous loop structure so we can identify new loops
  // created by unrolling.
  Loop *ParentL = L.getParentLoop();
  SmallPtrSet<Loop *, 4> OldLoops;
  if (ParentL)
    OldLoops.insert(ParentL->begin(), ParentL->end());
  else
    OldLoops.insert(AR.LI.begin(), AR.LI.end());

  // Copy the name now: the loop object may be deleted by unrolling below.
  std::string LoopName = std::string(L.getName());

  bool Changed = tryToUnrollLoop(&L, AR.DT, &AR.LI, AR.SE, AR.TTI, AR.AC, ORE,
                                 /*BFI*/ nullptr, /*PSI*/ nullptr,
                                 /*PreserveLCSSA*/ true, OptLevel,
                                 OnlyWhenForced, ForgetSCEV, /*Count*/ None,
                                 /*Threshold*/ None, /*AllowPartial*/ false,
                                 /*Runtime*/ false, /*UpperBound*/ false,
                                 /*AllowPeeling*/ false,
                                 /*AllowProfileBasedPeeling*/ false,
                                 /*FullUnrollMaxCount*/ None) !=
                 LoopUnrollResult::Unmodified;
  if (!Changed)
    return PreservedAnalyses::all();

  // The parent must not be damaged by unrolling!
#ifndef NDEBUG
  if (ParentL)
    ParentL->verifyLoop();
#endif

  // Unrolling can do several things to introduce new loops into a loop nest:
  // - Full unrolling clones child loops within the current loop but then
  //   removes the current loop making all of the children appear to be new
  //   sibling loops.
  //
  // When a new loop appears as a sibling loop after fully unrolling,
  // its nesting structure has fundamentally changed and we want to revisit
  // it to reflect that.
  //
  // When unrolling has removed the current loop, we need to tell the
  // infrastructure that it is gone.
  //
  // Finally, we support a debugging/testing mode where we revisit child loops
  // as well. These are not expected to require further optimizations as either
  // they or the loop they were cloned from have been directly visited already.
  // But the debugging mode allows us to check this assumption.
  bool IsCurrentLoopValid = false;
  SmallVector<Loop *, 4> SibLoops;
  if (ParentL)
    SibLoops.append(ParentL->begin(), ParentL->end());
  else
    SibLoops.append(AR.LI.begin(), AR.LI.end());
  // Keep only loops that are genuinely new; seeing &L itself means the
  // current loop survived the transformation.
  erase_if(SibLoops, [&](Loop *SibLoop) {
    if (SibLoop == &L) {
      IsCurrentLoopValid = true;
      return true;
    }

    // Otherwise erase the loop from the list if it was in the old loops.
    return OldLoops.count(SibLoop) != 0;
  });
  Updater.addSiblingLoops(SibLoops);

  if (!IsCurrentLoopValid) {
    Updater.markLoopAsDeleted(L, LoopName);
  } else {
    // We can only walk child loops if the current loop remained valid.
    if (UnrollRevisitChildLoops) {
      // Walk *all* of the child loops.
      SmallVector<Loop *, 4> ChildLoops(L.begin(), L.end());
      Updater.addChildLoops(ChildLoops);
    }
  }

  return getLoopPassPreservedAnalyses();
}
1391 
1392 PreservedAnalyses LoopUnrollPass::run(Function &F,
1393                                       FunctionAnalysisManager &AM) {
1394   auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
1395   auto &LI = AM.getResult<LoopAnalysis>(F);
1396   auto &TTI = AM.getResult<TargetIRAnalysis>(F);
1397   auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
1398   auto &AC = AM.getResult<AssumptionAnalysis>(F);
1399   auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
1400 
1401   LoopAnalysisManager *LAM = nullptr;
1402   if (auto *LAMProxy = AM.getCachedResult<LoopAnalysisManagerFunctionProxy>(F))
1403     LAM = &LAMProxy->getManager();
1404 
1405   const ModuleAnalysisManager &MAM =
1406       AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager();
1407   ProfileSummaryInfo *PSI =
1408       MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
1409   auto *BFI = (PSI && PSI->hasProfileSummary()) ?
1410       &AM.getResult<BlockFrequencyAnalysis>(F) : nullptr;
1411 
1412   bool Changed = false;
1413 
1414   // The unroller requires loops to be in simplified form, and also needs LCSSA.
1415   // Since simplification may add new inner loops, it has to run before the
1416   // legality and profitability checks. This means running the loop unroller
1417   // will simplify all loops, regardless of whether anything end up being
1418   // unrolled.
1419   for (auto &L : LI) {
1420     Changed |=
1421         simplifyLoop(L, &DT, &LI, &SE, &AC, nullptr, false /* PreserveLCSSA */);
1422     Changed |= formLCSSARecursively(*L, DT, &LI, &SE);
1423   }
1424 
1425   // Add the loop nests in the reverse order of LoopInfo. See method
1426   // declaration.
1427   SmallPriorityWorklist<Loop *, 4> Worklist;
1428   appendLoopsToWorklist(LI, Worklist);
1429 
1430   while (!Worklist.empty()) {
1431     // Because the LoopInfo stores the loops in RPO, we walk the worklist
1432     // from back to front so that we work forward across the CFG, which
1433     // for unrolling is only needed to get optimization remarks emitted in
1434     // a forward order.
1435     Loop &L = *Worklist.pop_back_val();
1436 #ifndef NDEBUG
1437     Loop *ParentL = L.getParentLoop();
1438 #endif
1439 
1440     // Check if the profile summary indicates that the profiled application
1441     // has a huge working set size, in which case we disable peeling to avoid
1442     // bloating it further.
1443     Optional<bool> LocalAllowPeeling = UnrollOpts.AllowPeeling;
1444     if (PSI && PSI->hasHugeWorkingSetSize())
1445       LocalAllowPeeling = false;
1446     std::string LoopName = std::string(L.getName());
1447     // The API here is quite complex to call and we allow to select some
1448     // flavors of unrolling during construction time (by setting UnrollOpts).
1449     LoopUnrollResult Result = tryToUnrollLoop(
1450         &L, DT, &LI, SE, TTI, AC, ORE, BFI, PSI,
1451         /*PreserveLCSSA*/ true, UnrollOpts.OptLevel, UnrollOpts.OnlyWhenForced,
1452         UnrollOpts.ForgetSCEV, /*Count*/ None,
1453         /*Threshold*/ None, UnrollOpts.AllowPartial, UnrollOpts.AllowRuntime,
1454         UnrollOpts.AllowUpperBound, LocalAllowPeeling,
1455         UnrollOpts.AllowProfileBasedPeeling, UnrollOpts.FullUnrollMaxCount);
1456     Changed |= Result != LoopUnrollResult::Unmodified;
1457 
1458     // The parent must not be damaged by unrolling!
1459 #ifndef NDEBUG
1460     if (Result != LoopUnrollResult::Unmodified && ParentL)
1461       ParentL->verifyLoop();
1462 #endif
1463 
1464     // Clear any cached analysis results for L if we removed it completely.
1465     if (LAM && Result == LoopUnrollResult::FullyUnrolled)
1466       LAM->clear(L, LoopName);
1467   }
1468 
1469   if (!Changed)
1470     return PreservedAnalyses::all();
1471 
1472   return getLoopPassPreservedAnalyses();
1473 }
1474