xref: /llvm-project/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp (revision ad9da92cf6f735747ef04fd56937e1d76819e503)
1 //===- LoopUnroll.cpp - Loop unroller pass --------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass implements a simple loop unroller.  It works best when loops have
10 // been canonicalized by the -indvars pass, allowing it to determine the trip
11 // counts of loops easily.
12 //===----------------------------------------------------------------------===//
13 
14 #include "llvm/Transforms/Scalar/LoopUnrollPass.h"
15 #include "llvm/ADT/DenseMap.h"
16 #include "llvm/ADT/DenseMapInfo.h"
17 #include "llvm/ADT/DenseSet.h"
18 #include "llvm/ADT/STLExtras.h"
19 #include "llvm/ADT/SetVector.h"
20 #include "llvm/ADT/SmallPtrSet.h"
21 #include "llvm/ADT/SmallVector.h"
22 #include "llvm/ADT/StringRef.h"
23 #include "llvm/Analysis/AssumptionCache.h"
24 #include "llvm/Analysis/BlockFrequencyInfo.h"
25 #include "llvm/Analysis/CodeMetrics.h"
26 #include "llvm/Analysis/LoopAnalysisManager.h"
27 #include "llvm/Analysis/LoopInfo.h"
28 #include "llvm/Analysis/LoopPass.h"
29 #include "llvm/Analysis/LoopUnrollAnalyzer.h"
30 #include "llvm/Analysis/MemorySSA.h"
31 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
32 #include "llvm/Analysis/ProfileSummaryInfo.h"
33 #include "llvm/Analysis/ScalarEvolution.h"
34 #include "llvm/Analysis/TargetTransformInfo.h"
35 #include "llvm/IR/BasicBlock.h"
36 #include "llvm/IR/CFG.h"
37 #include "llvm/IR/Constant.h"
38 #include "llvm/IR/Constants.h"
39 #include "llvm/IR/DiagnosticInfo.h"
40 #include "llvm/IR/Dominators.h"
41 #include "llvm/IR/Function.h"
42 #include "llvm/IR/Instruction.h"
43 #include "llvm/IR/Instructions.h"
44 #include "llvm/IR/IntrinsicInst.h"
45 #include "llvm/IR/Metadata.h"
46 #include "llvm/IR/PassManager.h"
47 #include "llvm/InitializePasses.h"
48 #include "llvm/Pass.h"
49 #include "llvm/Support/Casting.h"
50 #include "llvm/Support/CommandLine.h"
51 #include "llvm/Support/Debug.h"
52 #include "llvm/Support/ErrorHandling.h"
53 #include "llvm/Support/raw_ostream.h"
54 #include "llvm/Transforms/Scalar.h"
55 #include "llvm/Transforms/Scalar/LoopPassManager.h"
56 #include "llvm/Transforms/Utils.h"
57 #include "llvm/Transforms/Utils/LoopPeel.h"
58 #include "llvm/Transforms/Utils/LoopSimplify.h"
59 #include "llvm/Transforms/Utils/LoopUtils.h"
60 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
61 #include "llvm/Transforms/Utils/SizeOpts.h"
62 #include "llvm/Transforms/Utils/UnrollLoop.h"
63 #include <algorithm>
64 #include <cassert>
65 #include <cstdint>
66 #include <limits>
67 #include <optional>
68 #include <string>
69 #include <tuple>
70 #include <utility>
71 
72 using namespace llvm;
73 
74 #define DEBUG_TYPE "loop-unroll"
75 
76 cl::opt<bool> llvm::ForgetSCEVInLoopUnroll(
77     "forget-scev-loop-unroll", cl::init(false), cl::Hidden,
78     cl::desc("Forget everything in SCEV when doing LoopUnroll, instead of just"
79              " the current top-most loop. This is sometimes preferred to reduce"
80              " compile time."));
81 
82 static cl::opt<unsigned>
83     UnrollThreshold("unroll-threshold", cl::Hidden,
84                     cl::desc("The cost threshold for loop unrolling"));
85 
86 static cl::opt<unsigned>
87     UnrollOptSizeThreshold(
88       "unroll-optsize-threshold", cl::init(0), cl::Hidden,
89       cl::desc("The cost threshold for loop unrolling when optimizing for "
90                "size"));
91 
92 static cl::opt<unsigned> UnrollPartialThreshold(
93     "unroll-partial-threshold", cl::Hidden,
94     cl::desc("The cost threshold for partial loop unrolling"));
95 
96 static cl::opt<unsigned> UnrollMaxPercentThresholdBoost(
97     "unroll-max-percent-threshold-boost", cl::init(400), cl::Hidden,
98     cl::desc("The maximum 'boost' (represented as a percentage >= 100) applied "
99              "to the threshold when aggressively unrolling a loop due to the "
100              "dynamic cost savings. If completely unrolling a loop will reduce "
101              "the total runtime from X to Y, we boost the loop unroll "
102              "threshold to DefaultThreshold*std::min(MaxPercentThresholdBoost, "
103              "X/Y). This limit avoids excessive code bloat."));
104 
105 static cl::opt<unsigned> UnrollMaxIterationsCountToAnalyze(
106     "unroll-max-iteration-count-to-analyze", cl::init(10), cl::Hidden,
107     cl::desc("Don't allow loop unrolling to simulate more than this number of "
108              "iterations when checking full unroll profitability"));
109 
110 static cl::opt<unsigned> UnrollCount(
111     "unroll-count", cl::Hidden,
112     cl::desc("Use this unroll count for all loops including those with "
113              "unroll_count pragma values, for testing purposes"));
114 
115 static cl::opt<unsigned> UnrollMaxCount(
116     "unroll-max-count", cl::Hidden,
117     cl::desc("Set the max unroll count for partial and runtime unrolling, for"
118              "testing purposes"));
119 
120 static cl::opt<unsigned> UnrollFullMaxCount(
121     "unroll-full-max-count", cl::Hidden,
122     cl::desc(
123         "Set the max unroll count for full unrolling, for testing purposes"));
124 
125 static cl::opt<bool>
126     UnrollAllowPartial("unroll-allow-partial", cl::Hidden,
127                        cl::desc("Allows loops to be partially unrolled until "
128                                 "-unroll-threshold loop size is reached."));
129 
130 static cl::opt<bool> UnrollAllowRemainder(
131     "unroll-allow-remainder", cl::Hidden,
132     cl::desc("Allow generation of a loop remainder (extra iterations) "
133              "when unrolling a loop."));
134 
135 static cl::opt<bool>
136     UnrollRuntime("unroll-runtime", cl::Hidden,
137                   cl::desc("Unroll loops with run-time trip counts"));
138 
139 static cl::opt<unsigned> UnrollMaxUpperBound(
140     "unroll-max-upperbound", cl::init(8), cl::Hidden,
141     cl::desc(
142         "The max of trip count upper bound that is considered in unrolling"));
143 
144 static cl::opt<unsigned> PragmaUnrollThreshold(
145     "pragma-unroll-threshold", cl::init(16 * 1024), cl::Hidden,
146     cl::desc("Unrolled size limit for loops with an unroll(full) or "
147              "unroll_count pragma."));
148 
149 static cl::opt<unsigned> FlatLoopTripCountThreshold(
150     "flat-loop-tripcount-threshold", cl::init(5), cl::Hidden,
151     cl::desc("If the runtime tripcount for the loop is lower than the "
152              "threshold, the loop is considered as flat and will be less "
153              "aggressively unrolled."));
154 
155 static cl::opt<bool> UnrollUnrollRemainder(
156   "unroll-remainder", cl::Hidden,
157   cl::desc("Allow the loop remainder to be unrolled."));
158 
159 // This option isn't ever intended to be enabled, it serves to allow
160 // experiments to check the assumptions about when this kind of revisit is
161 // necessary.
162 static cl::opt<bool> UnrollRevisitChildLoops(
163     "unroll-revisit-child-loops", cl::Hidden,
164     cl::desc("Enqueue and re-visit child loops in the loop PM after unrolling. "
165              "This shouldn't typically be needed as child loops (or their "
166              "clones) were already visited."));
167 
168 static cl::opt<unsigned> UnrollThresholdAggressive(
169     "unroll-threshold-aggressive", cl::init(300), cl::Hidden,
170     cl::desc("Threshold (max size of unrolled loop) to use in aggressive (O3) "
171              "optimizations"));
172 static cl::opt<unsigned>
173     UnrollThresholdDefault("unroll-threshold-default", cl::init(150),
174                            cl::Hidden,
175                            cl::desc("Default threshold (max size of unrolled "
176                                     "loop), used in all but O3 optimizations"));
177 
178 static cl::opt<unsigned> PragmaUnrollFullMaxIterations(
179     "pragma-unroll-full-max-iterations", cl::init(1'000'000), cl::Hidden,
180     cl::desc("Maximum allowed iterations to unroll under pragma unroll full."));
181 
182 /// A magic value for use with the Threshold parameter to indicate
183 /// that the loop unroll should be performed regardless of how much
184 /// code expansion would result.
185 static const unsigned NoThreshold = std::numeric_limits<unsigned>::max();
186 
187 /// Gather the various unrolling parameters based on the defaults, compiler
188 /// flags, TTI overrides and user specified parameters.
189 TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences(
190     Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI,
191     BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
192     OptimizationRemarkEmitter &ORE, int OptLevel,
193     std::optional<unsigned> UserThreshold, std::optional<unsigned> UserCount,
194     std::optional<bool> UserAllowPartial, std::optional<bool> UserRuntime,
195     std::optional<bool> UserUpperBound,
196     std::optional<unsigned> UserFullUnrollMaxCount) {
197   TargetTransformInfo::UnrollingPreferences UP;
198 
199   // Set up the defaults
200   UP.Threshold =
201       OptLevel > 2 ? UnrollThresholdAggressive : UnrollThresholdDefault;
202   UP.MaxPercentThresholdBoost = 400;
203   UP.OptSizeThreshold = UnrollOptSizeThreshold;
204   UP.PartialThreshold = 150;
205   UP.PartialOptSizeThreshold = UnrollOptSizeThreshold;
206   UP.Count = 0;
207   UP.DefaultUnrollRuntimeCount = 8;
208   UP.MaxCount = std::numeric_limits<unsigned>::max();
209   UP.MaxUpperBound = UnrollMaxUpperBound;
210   UP.FullUnrollMaxCount = std::numeric_limits<unsigned>::max();
211   UP.BEInsns = 2;
212   UP.Partial = false;
213   UP.Runtime = false;
214   UP.AllowRemainder = true;
215   UP.UnrollRemainder = false;
216   UP.AllowExpensiveTripCount = false;
217   UP.Force = false;
218   UP.UpperBound = false;
219   UP.UnrollAndJam = false;
220   UP.UnrollAndJamInnerLoopThreshold = 60;
221   UP.MaxIterationsCountToAnalyze = UnrollMaxIterationsCountToAnalyze;
222   UP.SCEVExpansionBudget = SCEVCheapExpansionBudget;
223   UP.RuntimeUnrollMultiExit = false;
224 
225   // Override with any target specific settings
226   TTI.getUnrollingPreferences(L, SE, UP, &ORE);
227 
228   // Apply size attributes
229   bool OptForSize = L->getHeader()->getParent()->hasOptSize() ||
230                     // Let unroll hints / pragmas take precedence over PGSO.
231                     (hasUnrollTransformation(L) != TM_ForcedByUser &&
232                      llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
233                                                  PGSOQueryType::IRPass));
234   if (OptForSize) {
235     UP.Threshold = UP.OptSizeThreshold;
236     UP.PartialThreshold = UP.PartialOptSizeThreshold;
237     UP.MaxPercentThresholdBoost = 100;
238   }
239 
240   // Apply any user values specified by cl::opt
241   if (UnrollThreshold.getNumOccurrences() > 0)
242     UP.Threshold = UnrollThreshold;
243   if (UnrollPartialThreshold.getNumOccurrences() > 0)
244     UP.PartialThreshold = UnrollPartialThreshold;
245   if (UnrollMaxPercentThresholdBoost.getNumOccurrences() > 0)
246     UP.MaxPercentThresholdBoost = UnrollMaxPercentThresholdBoost;
247   if (UnrollMaxCount.getNumOccurrences() > 0)
248     UP.MaxCount = UnrollMaxCount;
249   if (UnrollMaxUpperBound.getNumOccurrences() > 0)
250     UP.MaxUpperBound = UnrollMaxUpperBound;
251   if (UnrollFullMaxCount.getNumOccurrences() > 0)
252     UP.FullUnrollMaxCount = UnrollFullMaxCount;
253   if (UnrollAllowPartial.getNumOccurrences() > 0)
254     UP.Partial = UnrollAllowPartial;
255   if (UnrollAllowRemainder.getNumOccurrences() > 0)
256     UP.AllowRemainder = UnrollAllowRemainder;
257   if (UnrollRuntime.getNumOccurrences() > 0)
258     UP.Runtime = UnrollRuntime;
259   if (UnrollMaxUpperBound == 0)
260     UP.UpperBound = false;
261   if (UnrollUnrollRemainder.getNumOccurrences() > 0)
262     UP.UnrollRemainder = UnrollUnrollRemainder;
263   if (UnrollMaxIterationsCountToAnalyze.getNumOccurrences() > 0)
264     UP.MaxIterationsCountToAnalyze = UnrollMaxIterationsCountToAnalyze;
265 
266   // Apply user values provided by argument
267   if (UserThreshold) {
268     UP.Threshold = *UserThreshold;
269     UP.PartialThreshold = *UserThreshold;
270   }
271   if (UserCount)
272     UP.Count = *UserCount;
273   if (UserAllowPartial)
274     UP.Partial = *UserAllowPartial;
275   if (UserRuntime)
276     UP.Runtime = *UserRuntime;
277   if (UserUpperBound)
278     UP.UpperBound = *UserUpperBound;
279   if (UserFullUnrollMaxCount)
280     UP.FullUnrollMaxCount = *UserFullUnrollMaxCount;
281 
282   return UP;
283 }
284 
285 namespace {
286 
287 /// A struct to densely store the state of an instruction after unrolling at
288 /// each iteration.
289 ///
290 /// This is designed to work like a tuple of <Instruction *, int> for the
291 /// purposes of hashing and lookup, but to be able to associate two boolean
292 /// states with each key.
293 struct UnrolledInstState {
294   Instruction *I;
295   int Iteration : 30;
296   unsigned IsFree : 1;
297   unsigned IsCounted : 1;
298 };
299 
300 /// Hashing and equality testing for a set of the instruction states.
301 struct UnrolledInstStateKeyInfo {
302   using PtrInfo = DenseMapInfo<Instruction *>;
303   using PairInfo = DenseMapInfo<std::pair<Instruction *, int>>;
304 
305   static inline UnrolledInstState getEmptyKey() {
306     return {PtrInfo::getEmptyKey(), 0, 0, 0};
307   }
308 
309   static inline UnrolledInstState getTombstoneKey() {
310     return {PtrInfo::getTombstoneKey(), 0, 0, 0};
311   }
312 
313   static inline unsigned getHashValue(const UnrolledInstState &S) {
314     return PairInfo::getHashValue({S.I, S.Iteration});
315   }
316 
317   static inline bool isEqual(const UnrolledInstState &LHS,
318                              const UnrolledInstState &RHS) {
319     return PairInfo::isEqual({LHS.I, LHS.Iteration}, {RHS.I, RHS.Iteration});
320   }
321 };
322 
323 struct EstimatedUnrollCost {
324   /// The estimated cost after unrolling.
325   unsigned UnrolledCost;
326 
327   /// The estimated dynamic cost of executing the instructions in the
328   /// rolled form.
329   unsigned RolledDynamicCost;
330 };
331 
332 struct PragmaInfo {
333   PragmaInfo(bool UUC, bool PFU, unsigned PC, bool PEU)
334       : UserUnrollCount(UUC), PragmaFullUnroll(PFU), PragmaCount(PC),
335         PragmaEnableUnroll(PEU) {}
336   const bool UserUnrollCount;
337   const bool PragmaFullUnroll;
338   const unsigned PragmaCount;
339   const bool PragmaEnableUnroll;
340 };
341 
342 } // end anonymous namespace
343 
344 /// Figure out if the loop is worth full unrolling.
345 ///
346 /// Complete loop unrolling can make some loads constant, and we need to know
347 /// if that would expose any further optimization opportunities.  This routine
348 /// estimates this optimization.  It computes cost of unrolled loop
349 /// (UnrolledCost) and dynamic cost of the original loop (RolledDynamicCost). By
350 /// dynamic cost we mean that we won't count costs of blocks that are known not
351 /// to be executed (i.e. if we have a branch in the loop and we know that at the
352 /// given iteration its condition would be resolved to true, we won't add up the
353 /// cost of the 'false'-block).
354 /// \returns Optional value, holding the RolledDynamicCost and UnrolledCost. If
355 /// the analysis failed (no benefits expected from the unrolling, or the loop is
356 /// too big to analyze), the returned value is std::nullopt.
357 static std::optional<EstimatedUnrollCost> analyzeLoopUnrollCost(
358     const Loop *L, unsigned TripCount, DominatorTree &DT, ScalarEvolution &SE,
359     const SmallPtrSetImpl<const Value *> &EphValues,
360     const TargetTransformInfo &TTI, unsigned MaxUnrolledLoopSize,
361     unsigned MaxIterationsCountToAnalyze) {
362   // We want to be able to scale offsets by the trip count and add more offsets
363   // to them without checking for overflows, and we already don't want to
364   // analyze *massive* trip counts, so we force the max to be reasonably small.
365   assert(MaxIterationsCountToAnalyze <
366              (unsigned)(std::numeric_limits<int>::max() / 2) &&
367          "The unroll iterations max is too large!");
368 
369   // Only analyze inner loops. We can't properly estimate cost of nested loops
370   // and we won't visit inner loops again anyway.
371   if (!L->isInnermost())
372     return std::nullopt;
373 
374   // Don't simulate loops with a big or unknown tripcount
375   if (!TripCount || TripCount > MaxIterationsCountToAnalyze)
376     return std::nullopt;
377 
378   SmallSetVector<BasicBlock *, 16> BBWorklist;
379   SmallSetVector<std::pair<BasicBlock *, BasicBlock *>, 4> ExitWorklist;
380   DenseMap<Value *, Value *> SimplifiedValues;
381   SmallVector<std::pair<Value *, Value *>, 4> SimplifiedInputValues;
382 
383   // The estimated cost of the unrolled form of the loop. We try to estimate
384   // this by simplifying as much as we can while computing the estimate.
385   InstructionCost UnrolledCost = 0;
386 
387   // We also track the estimated dynamic (that is, actually executed) cost in
388   // the rolled form. This helps identify cases when the savings from unrolling
389   // aren't just exposing dead control flows, but actual reduced dynamic
390   // instructions due to the simplifications which we expect to occur after
391   // unrolling.
392   InstructionCost RolledDynamicCost = 0;
393 
394   // We track the simplification of each instruction in each iteration. We use
395   // this to recursively merge costs into the unrolled cost on-demand so that
396   // we don't count the cost of any dead code. This is essentially a map from
397   // <instruction, int> to <bool, bool>, but stored as a densely packed struct.
398   DenseSet<UnrolledInstState, UnrolledInstStateKeyInfo> InstCostMap;
399 
400   // A small worklist used to accumulate cost of instructions from each
401   // observable and reached root in the loop.
402   SmallVector<Instruction *, 16> CostWorklist;
403 
404   // PHI-used worklist used between iterations while accumulating cost.
405   SmallVector<Instruction *, 4> PHIUsedList;
406 
407   // Helper function to accumulate cost for instructions in the loop.
408   auto AddCostRecursively = [&](Instruction &RootI, int Iteration) {
409     assert(Iteration >= 0 && "Cannot have a negative iteration!");
410     assert(CostWorklist.empty() && "Must start with an empty cost list");
411     assert(PHIUsedList.empty() && "Must start with an empty phi used list");
412     CostWorklist.push_back(&RootI);
413     TargetTransformInfo::TargetCostKind CostKind =
414       RootI.getFunction()->hasMinSize() ?
415       TargetTransformInfo::TCK_CodeSize :
416       TargetTransformInfo::TCK_SizeAndLatency;
417     for (;; --Iteration) {
418       do {
419         Instruction *I = CostWorklist.pop_back_val();
420 
421         // InstCostMap only uses I and Iteration as a key, the other two values
422         // don't matter here.
423         auto CostIter = InstCostMap.find({I, Iteration, 0, 0});
424         if (CostIter == InstCostMap.end())
425           // If an input to a PHI node comes from a dead path through the loop
426           // we may have no cost data for it here. What that actually means is
427           // that it is free.
428           continue;
429         auto &Cost = *CostIter;
430         if (Cost.IsCounted)
431           // Already counted this instruction.
432           continue;
433 
434         // Mark that we are counting the cost of this instruction now.
435         Cost.IsCounted = true;
436 
437         // If this is a PHI node in the loop header, just add it to the PHI set.
438         if (auto *PhiI = dyn_cast<PHINode>(I))
439           if (PhiI->getParent() == L->getHeader()) {
440             assert(Cost.IsFree && "Loop PHIs shouldn't be evaluated as they "
441                                   "inherently simplify during unrolling.");
442             if (Iteration == 0)
443               continue;
444 
445             // Push the incoming value from the backedge into the PHI used list
446             // if it is an in-loop instruction. We'll use this to populate the
447             // cost worklist for the next iteration (as we count backwards).
448             if (auto *OpI = dyn_cast<Instruction>(
449                     PhiI->getIncomingValueForBlock(L->getLoopLatch())))
450               if (L->contains(OpI))
451                 PHIUsedList.push_back(OpI);
452             continue;
453           }
454 
455         // First accumulate the cost of this instruction.
456         if (!Cost.IsFree) {
457           // Consider simplified operands in instruction cost.
458           SmallVector<Value *, 4> Operands;
459           transform(I->operands(), std::back_inserter(Operands),
460                     [&](Value *Op) {
461                       if (auto Res = SimplifiedValues.lookup(Op))
462                         return Res;
463                       return Op;
464                     });
465           UnrolledCost += TTI.getInstructionCost(I, Operands, CostKind);
466           LLVM_DEBUG(dbgs() << "Adding cost of instruction (iteration "
467                             << Iteration << "): ");
468           LLVM_DEBUG(I->dump());
469         }
470 
471         // We must count the cost of every operand which is not free,
472         // recursively. If we reach a loop PHI node, simply add it to the set
473         // to be considered on the next iteration (backwards!).
474         for (Value *Op : I->operands()) {
475           // Check whether this operand is free due to being a constant or
476           // outside the loop.
477           auto *OpI = dyn_cast<Instruction>(Op);
478           if (!OpI || !L->contains(OpI))
479             continue;
480 
481           // Otherwise accumulate its cost.
482           CostWorklist.push_back(OpI);
483         }
484       } while (!CostWorklist.empty());
485 
486       if (PHIUsedList.empty())
487         // We've exhausted the search.
488         break;
489 
490       assert(Iteration > 0 &&
491              "Cannot track PHI-used values past the first iteration!");
492       CostWorklist.append(PHIUsedList.begin(), PHIUsedList.end());
493       PHIUsedList.clear();
494     }
495   };
496 
497   // Ensure that we don't violate the loop structure invariants relied on by
498   // this analysis.
499   assert(L->isLoopSimplifyForm() && "Must put loop into normal form first.");
500   assert(L->isLCSSAForm(DT) &&
501          "Must have loops in LCSSA form to track live-out values.");
502 
503   LLVM_DEBUG(dbgs() << "Starting LoopUnroll profitability analysis...\n");
504 
505   TargetTransformInfo::TargetCostKind CostKind =
506     L->getHeader()->getParent()->hasMinSize() ?
507     TargetTransformInfo::TCK_CodeSize : TargetTransformInfo::TCK_SizeAndLatency;
508   // Simulate execution of each iteration of the loop counting instructions,
509   // which would be simplified.
510   // Since the same load will take different values on different iterations,
511   // we literally have to go through all loop's iterations.
512   for (unsigned Iteration = 0; Iteration < TripCount; ++Iteration) {
513     LLVM_DEBUG(dbgs() << " Analyzing iteration " << Iteration << "\n");
514 
515     // Prepare for the iteration by collecting any simplified entry or backedge
516     // inputs.
517     for (Instruction &I : *L->getHeader()) {
518       auto *PHI = dyn_cast<PHINode>(&I);
519       if (!PHI)
520         break;
521 
522       // The loop header PHI nodes must have exactly two input: one from the
523       // loop preheader and one from the loop latch.
524       assert(
525           PHI->getNumIncomingValues() == 2 &&
526           "Must have an incoming value only for the preheader and the latch.");
527 
528       Value *V = PHI->getIncomingValueForBlock(
529           Iteration == 0 ? L->getLoopPreheader() : L->getLoopLatch());
530       if (Iteration != 0 && SimplifiedValues.count(V))
531         V = SimplifiedValues.lookup(V);
532       SimplifiedInputValues.push_back({PHI, V});
533     }
534 
535     // Now clear and re-populate the map for the next iteration.
536     SimplifiedValues.clear();
537     while (!SimplifiedInputValues.empty())
538       SimplifiedValues.insert(SimplifiedInputValues.pop_back_val());
539 
540     UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, SE, L);
541 
542     BBWorklist.clear();
543     BBWorklist.insert(L->getHeader());
544     // Note that we *must not* cache the size, this loop grows the worklist.
545     for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
546       BasicBlock *BB = BBWorklist[Idx];
547 
548       // Visit all instructions in the given basic block and try to simplify
549       // it.  We don't change the actual IR, just count optimization
550       // opportunities.
551       for (Instruction &I : *BB) {
552         // These won't get into the final code - don't even try calculating the
553         // cost for them.
554         if (isa<DbgInfoIntrinsic>(I) || EphValues.count(&I))
555           continue;
556 
557         // Track this instruction's expected baseline cost when executing the
558         // rolled loop form.
559         RolledDynamicCost += TTI.getInstructionCost(&I, CostKind);
560 
561         // Visit the instruction to analyze its loop cost after unrolling,
562         // and if the visitor returns true, mark the instruction as free after
563         // unrolling and continue.
564         bool IsFree = Analyzer.visit(I);
565         bool Inserted = InstCostMap.insert({&I, (int)Iteration,
566                                            (unsigned)IsFree,
567                                            /*IsCounted*/ false}).second;
568         (void)Inserted;
569         assert(Inserted && "Cannot have a state for an unvisited instruction!");
570 
571         if (IsFree)
572           continue;
573 
574         // Can't properly model a cost of a call.
575         // FIXME: With a proper cost model we should be able to do it.
576         if (auto *CI = dyn_cast<CallInst>(&I)) {
577           const Function *Callee = CI->getCalledFunction();
578           if (!Callee || TTI.isLoweredToCall(Callee)) {
579             LLVM_DEBUG(dbgs() << "Can't analyze cost of loop with call\n");
580             return std::nullopt;
581           }
582         }
583 
584         // If the instruction might have a side-effect recursively account for
585         // the cost of it and all the instructions leading up to it.
586         if (I.mayHaveSideEffects())
587           AddCostRecursively(I, Iteration);
588 
589         // If unrolled body turns out to be too big, bail out.
590         if (UnrolledCost > MaxUnrolledLoopSize) {
591           LLVM_DEBUG(dbgs() << "  Exceeded threshold.. exiting.\n"
592                             << "  UnrolledCost: " << UnrolledCost
593                             << ", MaxUnrolledLoopSize: " << MaxUnrolledLoopSize
594                             << "\n");
595           return std::nullopt;
596         }
597       }
598 
599       Instruction *TI = BB->getTerminator();
600 
601       auto getSimplifiedConstant = [&](Value *V) -> Constant * {
602         if (SimplifiedValues.count(V))
603           V = SimplifiedValues.lookup(V);
604         return dyn_cast<Constant>(V);
605       };
606 
607       // Add in the live successors by first checking whether we have terminator
608       // that may be simplified based on the values simplified by this call.
609       BasicBlock *KnownSucc = nullptr;
610       if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
611         if (BI->isConditional()) {
612           if (auto *SimpleCond = getSimplifiedConstant(BI->getCondition())) {
613             // Just take the first successor if condition is undef
614             if (isa<UndefValue>(SimpleCond))
615               KnownSucc = BI->getSuccessor(0);
616             else if (ConstantInt *SimpleCondVal =
617                          dyn_cast<ConstantInt>(SimpleCond))
618               KnownSucc = BI->getSuccessor(SimpleCondVal->isZero() ? 1 : 0);
619           }
620         }
621       } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
622         if (auto *SimpleCond = getSimplifiedConstant(SI->getCondition())) {
623           // Just take the first successor if condition is undef
624           if (isa<UndefValue>(SimpleCond))
625             KnownSucc = SI->getSuccessor(0);
626           else if (ConstantInt *SimpleCondVal =
627                        dyn_cast<ConstantInt>(SimpleCond))
628             KnownSucc = SI->findCaseValue(SimpleCondVal)->getCaseSuccessor();
629         }
630       }
631       if (KnownSucc) {
632         if (L->contains(KnownSucc))
633           BBWorklist.insert(KnownSucc);
634         else
635           ExitWorklist.insert({BB, KnownSucc});
636         continue;
637       }
638 
639       // Add BB's successors to the worklist.
640       for (BasicBlock *Succ : successors(BB))
641         if (L->contains(Succ))
642           BBWorklist.insert(Succ);
643         else
644           ExitWorklist.insert({BB, Succ});
645       AddCostRecursively(*TI, Iteration);
646     }
647 
648     // If we found no optimization opportunities on the first iteration, we
649     // won't find them on later ones too.
650     if (UnrolledCost == RolledDynamicCost) {
651       LLVM_DEBUG(dbgs() << "  No opportunities found.. exiting.\n"
652                         << "  UnrolledCost: " << UnrolledCost << "\n");
653       return std::nullopt;
654     }
655   }
656 
657   while (!ExitWorklist.empty()) {
658     BasicBlock *ExitingBB, *ExitBB;
659     std::tie(ExitingBB, ExitBB) = ExitWorklist.pop_back_val();
660 
661     for (Instruction &I : *ExitBB) {
662       auto *PN = dyn_cast<PHINode>(&I);
663       if (!PN)
664         break;
665 
666       Value *Op = PN->getIncomingValueForBlock(ExitingBB);
667       if (auto *OpI = dyn_cast<Instruction>(Op))
668         if (L->contains(OpI))
669           AddCostRecursively(*OpI, TripCount - 1);
670     }
671   }
672 
673   assert(UnrolledCost.isValid() && RolledDynamicCost.isValid() &&
674          "All instructions must have a valid cost, whether the "
675          "loop is rolled or unrolled.");
676 
677   LLVM_DEBUG(dbgs() << "Analysis finished:\n"
678                     << "UnrolledCost: " << UnrolledCost << ", "
679                     << "RolledDynamicCost: " << RolledDynamicCost << "\n");
680   return {{unsigned(*UnrolledCost.getValue()),
681            unsigned(*RolledDynamicCost.getValue())}};
682 }
683 
684 UnrollCostEstimator::UnrollCostEstimator(
685     const Loop *L, const TargetTransformInfo &TTI,
686     const SmallPtrSetImpl<const Value *> &EphValues, unsigned BEInsns) {
687   CodeMetrics Metrics;
688   for (BasicBlock *BB : L->blocks())
689     Metrics.analyzeBasicBlock(BB, TTI, EphValues, /* PrepareForLTO= */ false,
690                               L);
691   NumInlineCandidates = Metrics.NumInlineCandidates;
692   NotDuplicatable = Metrics.notDuplicatable;
693   Convergence = Metrics.Convergence;
694   LoopSize = Metrics.NumInsts;
695   ConvergenceAllowsRuntime =
696       Metrics.Convergence != ConvergenceKind::Uncontrolled &&
697       !getLoopConvergenceHeart(L);
698 
699   // Don't allow an estimate of size zero.  This would allows unrolling of loops
700   // with huge iteration counts, which is a compile time problem even if it's
701   // not a problem for code quality. Also, the code using this size may assume
702   // that each loop has at least three instructions (likely a conditional
703   // branch, a comparison feeding that branch, and some kind of loop increment
704   // feeding that comparison instruction).
705   if (LoopSize.isValid() && LoopSize < BEInsns + 1)
706     // This is an open coded max() on InstructionCost
707     LoopSize = BEInsns + 1;
708 }
709 
710 bool UnrollCostEstimator::canUnroll() const {
711   switch (Convergence) {
712   case ConvergenceKind::ExtendedLoop:
713     LLVM_DEBUG(dbgs() << "  Convergence prevents unrolling.\n");
714     return false;
715   default:
716     break;
717   }
718   if (!LoopSize.isValid()) {
719     LLVM_DEBUG(dbgs() << "  Invalid loop size prevents unrolling.\n");
720     return false;
721   }
722   if (NotDuplicatable) {
723     LLVM_DEBUG(dbgs() << "  Non-duplicatable blocks prevent unrolling.\n");
724     return false;
725   }
726   return true;
727 }
728 
729 uint64_t UnrollCostEstimator::getUnrolledLoopSize(
730     const TargetTransformInfo::UnrollingPreferences &UP,
731     unsigned CountOverwrite) const {
732   unsigned LS = *LoopSize.getValue();
733   assert(LS >= UP.BEInsns && "LoopSize should not be less than BEInsns!");
734   if (CountOverwrite)
735     return static_cast<uint64_t>(LS - UP.BEInsns) * CountOverwrite + UP.BEInsns;
736   else
737     return static_cast<uint64_t>(LS - UP.BEInsns) * UP.Count + UP.BEInsns;
738 }
739 
740 // Returns the loop hint metadata node with the given name (for example,
741 // "llvm.loop.unroll.count").  If no such metadata node exists, then nullptr is
742 // returned.
743 static MDNode *getUnrollMetadataForLoop(const Loop *L, StringRef Name) {
744   if (MDNode *LoopID = L->getLoopID())
745     return GetUnrollMetadata(LoopID, Name);
746   return nullptr;
747 }
748 
749 // Returns true if the loop has an unroll(full) pragma.
750 static bool hasUnrollFullPragma(const Loop *L) {
751   return getUnrollMetadataForLoop(L, "llvm.loop.unroll.full");
752 }
753 
754 // Returns true if the loop has an unroll(enable) pragma. This metadata is used
755 // for both "#pragma unroll" and "#pragma clang loop unroll(enable)" directives.
756 static bool hasUnrollEnablePragma(const Loop *L) {
757   return getUnrollMetadataForLoop(L, "llvm.loop.unroll.enable");
758 }
759 
760 // Returns true if the loop has an runtime unroll(disable) pragma.
761 static bool hasRuntimeUnrollDisablePragma(const Loop *L) {
762   return getUnrollMetadataForLoop(L, "llvm.loop.unroll.runtime.disable");
763 }
764 
765 // If loop has an unroll_count pragma return the (necessarily
766 // positive) value from the pragma.  Otherwise return 0.
767 static unsigned unrollCountPragmaValue(const Loop *L) {
768   MDNode *MD = getUnrollMetadataForLoop(L, "llvm.loop.unroll.count");
769   if (MD) {
770     assert(MD->getNumOperands() == 2 &&
771            "Unroll count hint metadata should have two operands.");
772     unsigned Count =
773         mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
774     assert(Count >= 1 && "Unroll count must be positive.");
775     return Count;
776   }
777   return 0;
778 }
779 
780 // Computes the boosting factor for complete unrolling.
781 // If fully unrolling the loop would save a lot of RolledDynamicCost, it would
782 // be beneficial to fully unroll the loop even if unrolledcost is large. We
783 // use (RolledDynamicCost / UnrolledCost) to model the unroll benefits to adjust
784 // the unroll threshold.
785 static unsigned getFullUnrollBoostingFactor(const EstimatedUnrollCost &Cost,
786                                             unsigned MaxPercentThresholdBoost) {
787   if (Cost.RolledDynamicCost >= std::numeric_limits<unsigned>::max() / 100)
788     return 100;
789   else if (Cost.UnrolledCost != 0)
790     // The boosting factor is RolledDynamicCost / UnrolledCost
791     return std::min(100 * Cost.RolledDynamicCost / Cost.UnrolledCost,
792                     MaxPercentThresholdBoost);
793   else
794     return MaxPercentThresholdBoost;
795 }
796 
797 static std::optional<unsigned>
798 shouldPragmaUnroll(Loop *L, const PragmaInfo &PInfo,
799                    const unsigned TripMultiple, const unsigned TripCount,
800                    unsigned MaxTripCount, const UnrollCostEstimator UCE,
801                    const TargetTransformInfo::UnrollingPreferences &UP) {
802 
803   // Using unroll pragma
804   // 1st priority is unroll count set by "unroll-count" option.
805 
806   if (PInfo.UserUnrollCount) {
807     if (UP.AllowRemainder &&
808         UCE.getUnrolledLoopSize(UP, (unsigned)UnrollCount) < UP.Threshold)
809       return (unsigned)UnrollCount;
810   }
811 
812   // 2nd priority is unroll count set by pragma.
813   if (PInfo.PragmaCount > 0) {
814     if ((UP.AllowRemainder || (TripMultiple % PInfo.PragmaCount == 0)))
815       return PInfo.PragmaCount;
816   }
817 
818   if (PInfo.PragmaFullUnroll && TripCount != 0) {
819     // Certain cases with UBSAN can cause trip count to be calculated as
820     // INT_MAX, Block full unrolling at a reasonable limit so that the compiler
821     // doesn't hang trying to unroll the loop. See PR77842
822     if (TripCount > PragmaUnrollFullMaxIterations) {
823       LLVM_DEBUG(dbgs() << "Won't unroll; trip count is too large\n");
824       return std::nullopt;
825     }
826 
827     return TripCount;
828   }
829 
830   if (PInfo.PragmaEnableUnroll && !TripCount && MaxTripCount &&
831       MaxTripCount <= UP.MaxUpperBound)
832     return MaxTripCount;
833 
834   // if didn't return until here, should continue to other priorties
835   return std::nullopt;
836 }
837 
838 static std::optional<unsigned> shouldFullUnroll(
839     Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT,
840     ScalarEvolution &SE, const SmallPtrSetImpl<const Value *> &EphValues,
841     const unsigned FullUnrollTripCount, const UnrollCostEstimator UCE,
842     const TargetTransformInfo::UnrollingPreferences &UP) {
843   assert(FullUnrollTripCount && "should be non-zero!");
844 
845   if (FullUnrollTripCount > UP.FullUnrollMaxCount)
846     return std::nullopt;
847 
848   // When computing the unrolled size, note that BEInsns are not replicated
849   // like the rest of the loop body.
850   if (UCE.getUnrolledLoopSize(UP) < UP.Threshold)
851     return FullUnrollTripCount;
852 
853   // The loop isn't that small, but we still can fully unroll it if that
854   // helps to remove a significant number of instructions.
855   // To check that, run additional analysis on the loop.
856   if (std::optional<EstimatedUnrollCost> Cost = analyzeLoopUnrollCost(
857           L, FullUnrollTripCount, DT, SE, EphValues, TTI,
858           UP.Threshold * UP.MaxPercentThresholdBoost / 100,
859           UP.MaxIterationsCountToAnalyze)) {
860     unsigned Boost =
861       getFullUnrollBoostingFactor(*Cost, UP.MaxPercentThresholdBoost);
862     if (Cost->UnrolledCost < UP.Threshold * Boost / 100)
863       return FullUnrollTripCount;
864   }
865   return std::nullopt;
866 }
867 
868 static std::optional<unsigned>
869 shouldPartialUnroll(const unsigned LoopSize, const unsigned TripCount,
870                     const UnrollCostEstimator UCE,
871                     const TargetTransformInfo::UnrollingPreferences &UP) {
872 
873   if (!TripCount)
874     return std::nullopt;
875 
876   if (!UP.Partial) {
877     LLVM_DEBUG(dbgs() << "  will not try to unroll partially because "
878                << "-unroll-allow-partial not given\n");
879     return 0;
880   }
881   unsigned count = UP.Count;
882   if (count == 0)
883     count = TripCount;
884   if (UP.PartialThreshold != NoThreshold) {
885     // Reduce unroll count to be modulo of TripCount for partial unrolling.
886     if (UCE.getUnrolledLoopSize(UP, count) > UP.PartialThreshold)
887       count = (std::max(UP.PartialThreshold, UP.BEInsns + 1) - UP.BEInsns) /
888         (LoopSize - UP.BEInsns);
889     if (count > UP.MaxCount)
890       count = UP.MaxCount;
891     while (count != 0 && TripCount % count != 0)
892       count--;
893     if (UP.AllowRemainder && count <= 1) {
894       // If there is no Count that is modulo of TripCount, set Count to
895       // largest power-of-two factor that satisfies the threshold limit.
896       // As we'll create fixup loop, do the type of unrolling only if
897       // remainder loop is allowed.
898       count = UP.DefaultUnrollRuntimeCount;
899       while (count != 0 &&
900              UCE.getUnrolledLoopSize(UP, count) > UP.PartialThreshold)
901         count >>= 1;
902     }
903     if (count < 2) {
904       count = 0;
905     }
906   } else {
907     count = TripCount;
908   }
909   if (count > UP.MaxCount)
910     count = UP.MaxCount;
911 
912   LLVM_DEBUG(dbgs() << "  partially unrolling with count: " << count << "\n");
913 
914   return count;
915 }
916 // Returns true if unroll count was set explicitly.
917 // Calculates unroll count and writes it to UP.Count.
918 // Unless IgnoreUser is true, will also use metadata and command-line options
919 // that are specific to to the LoopUnroll pass (which, for instance, are
920 // irrelevant for the LoopUnrollAndJam pass).
921 // FIXME: This function is used by LoopUnroll and LoopUnrollAndJam, but consumes
922 // many LoopUnroll-specific options. The shared functionality should be
923 // refactored into it own function.
924 bool llvm::computeUnrollCount(
925     Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI,
926     AssumptionCache *AC, ScalarEvolution &SE,
927     const SmallPtrSetImpl<const Value *> &EphValues,
928     OptimizationRemarkEmitter *ORE, unsigned TripCount, unsigned MaxTripCount,
929     bool MaxOrZero, unsigned TripMultiple, const UnrollCostEstimator &UCE,
930     TargetTransformInfo::UnrollingPreferences &UP,
931     TargetTransformInfo::PeelingPreferences &PP, bool &UseUpperBound) {
932 
933   unsigned LoopSize = UCE.getRolledLoopSize();
934 
935   const bool UserUnrollCount = UnrollCount.getNumOccurrences() > 0;
936   const bool PragmaFullUnroll = hasUnrollFullPragma(L);
937   const unsigned PragmaCount = unrollCountPragmaValue(L);
938   const bool PragmaEnableUnroll = hasUnrollEnablePragma(L);
939 
940   const bool ExplicitUnroll = PragmaCount > 0 || PragmaFullUnroll ||
941                               PragmaEnableUnroll || UserUnrollCount;
942 
943   PragmaInfo PInfo(UserUnrollCount, PragmaFullUnroll, PragmaCount,
944                    PragmaEnableUnroll);
945   // Use an explicit peel count that has been specified for testing. In this
946   // case it's not permitted to also specify an explicit unroll count.
947   if (PP.PeelCount) {
948     if (UnrollCount.getNumOccurrences() > 0) {
949       report_fatal_error("Cannot specify both explicit peel count and "
950                          "explicit unroll count", /*GenCrashDiag=*/false);
951     }
952     UP.Count = 1;
953     UP.Runtime = false;
954     return true;
955   }
956   // Check for explicit Count.
957   // 1st priority is unroll count set by "unroll-count" option.
958   // 2nd priority is unroll count set by pragma.
959   if (auto UnrollFactor = shouldPragmaUnroll(L, PInfo, TripMultiple, TripCount,
960                                              MaxTripCount, UCE, UP)) {
961     UP.Count = *UnrollFactor;
962 
963     if (UserUnrollCount || (PragmaCount > 0)) {
964       UP.AllowExpensiveTripCount = true;
965       UP.Force = true;
966     }
967     UP.Runtime |= (PragmaCount > 0);
968     return ExplicitUnroll;
969   } else {
970     if (ExplicitUnroll && TripCount != 0) {
971       // If the loop has an unrolling pragma, we want to be more aggressive with
972       // unrolling limits. Set thresholds to at least the PragmaUnrollThreshold
973       // value which is larger than the default limits.
974       UP.Threshold = std::max<unsigned>(UP.Threshold, PragmaUnrollThreshold);
975       UP.PartialThreshold =
976           std::max<unsigned>(UP.PartialThreshold, PragmaUnrollThreshold);
977     }
978   }
979 
980   // 3rd priority is exact full unrolling.  This will eliminate all copies
981   // of some exit test.
982   UP.Count = 0;
983   if (TripCount) {
984     UP.Count = TripCount;
985     if (auto UnrollFactor = shouldFullUnroll(L, TTI, DT, SE, EphValues,
986                                              TripCount, UCE, UP)) {
987       UP.Count = *UnrollFactor;
988       UseUpperBound = false;
989       return ExplicitUnroll;
990     }
991   }
992 
993   // 4th priority is bounded unrolling.
994   // We can unroll by the upper bound amount if it's generally allowed or if
995   // we know that the loop is executed either the upper bound or zero times.
996   // (MaxOrZero unrolling keeps only the first loop test, so the number of
997   // loop tests remains the same compared to the non-unrolled version, whereas
998   // the generic upper bound unrolling keeps all but the last loop test so the
999   // number of loop tests goes up which may end up being worse on targets with
1000   // constrained branch predictor resources so is controlled by an option.)
1001   // In addition we only unroll small upper bounds.
1002   // Note that the cost of bounded unrolling is always strictly greater than
1003   // cost of exact full unrolling.  As such, if we have an exact count and
1004   // found it unprofitable, we'll never chose to bounded unroll.
1005   if (!TripCount && MaxTripCount && (UP.UpperBound || MaxOrZero) &&
1006       MaxTripCount <= UP.MaxUpperBound) {
1007     UP.Count = MaxTripCount;
1008     if (auto UnrollFactor = shouldFullUnroll(L, TTI, DT, SE, EphValues,
1009                                              MaxTripCount, UCE, UP)) {
1010       UP.Count = *UnrollFactor;
1011       UseUpperBound = true;
1012       return ExplicitUnroll;
1013     }
1014   }
1015 
1016   // 5th priority is loop peeling.
1017   computePeelCount(L, LoopSize, PP, TripCount, DT, SE, AC, UP.Threshold);
1018   if (PP.PeelCount) {
1019     UP.Runtime = false;
1020     UP.Count = 1;
1021     return ExplicitUnroll;
1022   }
1023 
1024   // Before starting partial unrolling, set up.partial to true,
1025   // if user explicitly asked  for unrolling
1026   if (TripCount)
1027     UP.Partial |= ExplicitUnroll;
1028 
1029   // 6th priority is partial unrolling.
1030   // Try partial unroll only when TripCount could be statically calculated.
1031   if (auto UnrollFactor = shouldPartialUnroll(LoopSize, TripCount, UCE, UP)) {
1032     UP.Count = *UnrollFactor;
1033 
1034     if ((PragmaFullUnroll || PragmaEnableUnroll) && TripCount &&
1035         UP.Count != TripCount)
1036       ORE->emit([&]() {
1037         return OptimizationRemarkMissed(DEBUG_TYPE,
1038                                         "FullUnrollAsDirectedTooLarge",
1039                                         L->getStartLoc(), L->getHeader())
1040                << "Unable to fully unroll loop as directed by unroll pragma "
1041                   "because "
1042                   "unrolled size is too large.";
1043       });
1044 
1045     if (UP.PartialThreshold != NoThreshold) {
1046       if (UP.Count == 0) {
1047         if (PragmaEnableUnroll)
1048           ORE->emit([&]() {
1049             return OptimizationRemarkMissed(DEBUG_TYPE,
1050                                             "UnrollAsDirectedTooLarge",
1051                                             L->getStartLoc(), L->getHeader())
1052                    << "Unable to unroll loop as directed by unroll(enable) "
1053                       "pragma "
1054                       "because unrolled size is too large.";
1055           });
1056       }
1057     }
1058     return ExplicitUnroll;
1059   }
1060   assert(TripCount == 0 &&
1061          "All cases when TripCount is constant should be covered here.");
1062   if (PragmaFullUnroll)
1063     ORE->emit([&]() {
1064       return OptimizationRemarkMissed(
1065                  DEBUG_TYPE, "CantFullUnrollAsDirectedRuntimeTripCount",
1066                  L->getStartLoc(), L->getHeader())
1067              << "Unable to fully unroll loop as directed by unroll(full) "
1068                 "pragma "
1069                 "because loop has a runtime trip count.";
1070     });
1071 
1072   // 7th priority is runtime unrolling.
1073   // Don't unroll a runtime trip count loop when it is disabled.
1074   if (hasRuntimeUnrollDisablePragma(L)) {
1075     UP.Count = 0;
1076     return false;
1077   }
1078 
1079   // Don't unroll a small upper bound loop unless user or TTI asked to do so.
1080   if (MaxTripCount && !UP.Force && MaxTripCount < UP.MaxUpperBound) {
1081     UP.Count = 0;
1082     return false;
1083   }
1084 
1085   // Check if the runtime trip count is too small when profile is available.
1086   if (L->getHeader()->getParent()->hasProfileData()) {
1087     if (auto ProfileTripCount = getLoopEstimatedTripCount(L)) {
1088       if (*ProfileTripCount < FlatLoopTripCountThreshold)
1089         return false;
1090       else
1091         UP.AllowExpensiveTripCount = true;
1092     }
1093   }
1094   UP.Runtime |= PragmaEnableUnroll || PragmaCount > 0 || UserUnrollCount;
1095   if (!UP.Runtime) {
1096     LLVM_DEBUG(
1097         dbgs() << "  will not try to unroll loop with runtime trip count "
1098                << "-unroll-runtime not given\n");
1099     UP.Count = 0;
1100     return false;
1101   }
1102   if (UP.Count == 0)
1103     UP.Count = UP.DefaultUnrollRuntimeCount;
1104 
1105   // Reduce unroll count to be the largest power-of-two factor of
1106   // the original count which satisfies the threshold limit.
1107   while (UP.Count != 0 &&
1108          UCE.getUnrolledLoopSize(UP) > UP.PartialThreshold)
1109     UP.Count >>= 1;
1110 
1111 #ifndef NDEBUG
1112   unsigned OrigCount = UP.Count;
1113 #endif
1114 
1115   if (!UP.AllowRemainder && UP.Count != 0 && (TripMultiple % UP.Count) != 0) {
1116     while (UP.Count != 0 && TripMultiple % UP.Count != 0)
1117       UP.Count >>= 1;
1118     LLVM_DEBUG(
1119         dbgs() << "Remainder loop is restricted (that could architecture "
1120                   "specific or because the loop contains a convergent "
1121                   "instruction), so unroll count must divide the trip "
1122                   "multiple, "
1123                << TripMultiple << ".  Reducing unroll count from " << OrigCount
1124                << " to " << UP.Count << ".\n");
1125 
1126     using namespace ore;
1127 
1128     if (unrollCountPragmaValue(L) > 0 && !UP.AllowRemainder)
1129       ORE->emit([&]() {
1130         return OptimizationRemarkMissed(DEBUG_TYPE,
1131                                         "DifferentUnrollCountFromDirected",
1132                                         L->getStartLoc(), L->getHeader())
1133                << "Unable to unroll loop the number of times directed by "
1134                   "unroll_count pragma because remainder loop is restricted "
1135                   "(that could architecture specific or because the loop "
1136                   "contains a convergent instruction) and so must have an "
1137                   "unroll "
1138                   "count that divides the loop trip multiple of "
1139                << NV("TripMultiple", TripMultiple) << ".  Unrolling instead "
1140                << NV("UnrollCount", UP.Count) << " time(s).";
1141       });
1142   }
1143 
1144   if (UP.Count > UP.MaxCount)
1145     UP.Count = UP.MaxCount;
1146 
1147   if (MaxTripCount && UP.Count > MaxTripCount)
1148     UP.Count = MaxTripCount;
1149 
1150   LLVM_DEBUG(dbgs() << "  runtime unrolling with count: " << UP.Count
1151                     << "\n");
1152   if (UP.Count < 2)
1153     UP.Count = 0;
1154   return ExplicitUnroll;
1155 }
1156 
1157 static LoopUnrollResult
1158 tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
1159                 const TargetTransformInfo &TTI, AssumptionCache &AC,
1160                 OptimizationRemarkEmitter &ORE, BlockFrequencyInfo *BFI,
1161                 ProfileSummaryInfo *PSI, bool PreserveLCSSA, int OptLevel,
1162                 bool OnlyFullUnroll, bool OnlyWhenForced, bool ForgetAllSCEV,
1163                 std::optional<unsigned> ProvidedCount,
1164                 std::optional<unsigned> ProvidedThreshold,
1165                 std::optional<bool> ProvidedAllowPartial,
1166                 std::optional<bool> ProvidedRuntime,
1167                 std::optional<bool> ProvidedUpperBound,
1168                 std::optional<bool> ProvidedAllowPeeling,
1169                 std::optional<bool> ProvidedAllowProfileBasedPeeling,
1170                 std::optional<unsigned> ProvidedFullUnrollMaxCount,
1171                 AAResults *AA = nullptr) {
1172 
1173   LLVM_DEBUG(dbgs() << "Loop Unroll: F["
1174                     << L->getHeader()->getParent()->getName() << "] Loop %"
1175                     << L->getHeader()->getName() << "\n");
1176   TransformationMode TM = hasUnrollTransformation(L);
1177   if (TM & TM_Disable)
1178     return LoopUnrollResult::Unmodified;
1179 
1180   // If this loop isn't forced to be unrolled, avoid unrolling it when the
1181   // parent loop has an explicit unroll-and-jam pragma. This is to prevent
1182   // automatic unrolling from interfering with the user requested
1183   // transformation.
1184   Loop *ParentL = L->getParentLoop();
1185   if (ParentL != nullptr &&
1186       hasUnrollAndJamTransformation(ParentL) == TM_ForcedByUser &&
1187       hasUnrollTransformation(L) != TM_ForcedByUser) {
1188     LLVM_DEBUG(dbgs() << "Not unrolling loop since parent loop has"
1189                       << " llvm.loop.unroll_and_jam.\n");
1190     return LoopUnrollResult::Unmodified;
1191   }
1192 
1193   // If this loop isn't forced to be unrolled, avoid unrolling it when the
1194   // loop has an explicit unroll-and-jam pragma. This is to prevent automatic
1195   // unrolling from interfering with the user requested transformation.
1196   if (hasUnrollAndJamTransformation(L) == TM_ForcedByUser &&
1197       hasUnrollTransformation(L) != TM_ForcedByUser) {
1198     LLVM_DEBUG(
1199         dbgs()
1200         << "  Not unrolling loop since it has llvm.loop.unroll_and_jam.\n");
1201     return LoopUnrollResult::Unmodified;
1202   }
1203 
1204   if (!L->isLoopSimplifyForm()) {
1205     LLVM_DEBUG(
1206         dbgs() << "  Not unrolling loop which is not in loop-simplify form.\n");
1207     return LoopUnrollResult::Unmodified;
1208   }
1209 
1210   // When automatic unrolling is disabled, do not unroll unless overridden for
1211   // this loop.
1212   if (OnlyWhenForced && !(TM & TM_Enable))
1213     return LoopUnrollResult::Unmodified;
1214 
1215   bool OptForSize = L->getHeader()->getParent()->hasOptSize();
1216   TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences(
1217       L, SE, TTI, BFI, PSI, ORE, OptLevel, ProvidedThreshold, ProvidedCount,
1218       ProvidedAllowPartial, ProvidedRuntime, ProvidedUpperBound,
1219       ProvidedFullUnrollMaxCount);
1220   TargetTransformInfo::PeelingPreferences PP = gatherPeelingPreferences(
1221       L, SE, TTI, ProvidedAllowPeeling, ProvidedAllowProfileBasedPeeling, true);
1222 
1223   // Exit early if unrolling is disabled. For OptForSize, we pick the loop size
1224   // as threshold later on.
1225   if (UP.Threshold == 0 && (!UP.Partial || UP.PartialThreshold == 0) &&
1226       !OptForSize)
1227     return LoopUnrollResult::Unmodified;
1228 
1229   SmallPtrSet<const Value *, 32> EphValues;
1230   CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
1231 
1232   UnrollCostEstimator UCE(L, TTI, EphValues, UP.BEInsns);
1233   if (!UCE.canUnroll()) {
1234     LLVM_DEBUG(dbgs() << "  Loop not considered unrollable.\n");
1235     return LoopUnrollResult::Unmodified;
1236   }
1237 
1238   unsigned LoopSize = UCE.getRolledLoopSize();
1239   LLVM_DEBUG(dbgs() << "  Loop Size = " << LoopSize << "\n");
1240 
1241   // When optimizing for size, use LoopSize + 1 as threshold (we use < Threshold
1242   // later), to (fully) unroll loops, if it does not increase code size.
1243   if (OptForSize)
1244     UP.Threshold = std::max(UP.Threshold, LoopSize + 1);
1245 
1246   if (UCE.NumInlineCandidates != 0) {
1247     LLVM_DEBUG(dbgs() << "  Not unrolling loop with inlinable calls.\n");
1248     return LoopUnrollResult::Unmodified;
1249   }
1250 
1251   // Find the smallest exact trip count for any exit. This is an upper bound
1252   // on the loop trip count, but an exit at an earlier iteration is still
1253   // possible. An unroll by the smallest exact trip count guarantees that all
1254   // branches relating to at least one exit can be eliminated. This is unlike
1255   // the max trip count, which only guarantees that the backedge can be broken.
1256   unsigned TripCount = 0;
1257   unsigned TripMultiple = 1;
1258   SmallVector<BasicBlock *, 8> ExitingBlocks;
1259   L->getExitingBlocks(ExitingBlocks);
1260   for (BasicBlock *ExitingBlock : ExitingBlocks)
1261     if (unsigned TC = SE.getSmallConstantTripCount(L, ExitingBlock))
1262       if (!TripCount || TC < TripCount)
1263         TripCount = TripMultiple = TC;
1264 
1265   if (!TripCount) {
1266     // If no exact trip count is known, determine the trip multiple of either
1267     // the loop latch or the single exiting block.
1268     // TODO: Relax for multiple exits.
1269     BasicBlock *ExitingBlock = L->getLoopLatch();
1270     if (!ExitingBlock || !L->isLoopExiting(ExitingBlock))
1271       ExitingBlock = L->getExitingBlock();
1272     if (ExitingBlock)
1273       TripMultiple = SE.getSmallConstantTripMultiple(L, ExitingBlock);
1274   }
1275 
1276   // If the loop contains a convergent operation, the prelude we'd add
1277   // to do the first few instructions before we hit the unrolled loop
1278   // is unsafe -- it adds a control-flow dependency to the convergent
1279   // operation.  Therefore restrict remainder loop (try unrolling without).
1280   //
1281   // TODO: This is somewhat conservative; we could allow the remainder if the
1282   // trip count is uniform.
1283   UP.AllowRemainder &= UCE.ConvergenceAllowsRuntime;
1284 
1285   // Try to find the trip count upper bound if we cannot find the exact trip
1286   // count.
1287   unsigned MaxTripCount = 0;
1288   bool MaxOrZero = false;
1289   if (!TripCount) {
1290     MaxTripCount = SE.getSmallConstantMaxTripCount(L);
1291     MaxOrZero = SE.isBackedgeTakenCountMaxOrZero(L);
1292   }
1293 
1294   // computeUnrollCount() decides whether it is beneficial to use upper bound to
1295   // fully unroll the loop.
1296   bool UseUpperBound = false;
1297   bool IsCountSetExplicitly = computeUnrollCount(
1298       L, TTI, DT, LI, &AC, SE, EphValues, &ORE, TripCount, MaxTripCount,
1299       MaxOrZero, TripMultiple, UCE, UP, PP, UseUpperBound);
1300   if (!UP.Count)
1301     return LoopUnrollResult::Unmodified;
1302 
1303   UP.Runtime &= UCE.ConvergenceAllowsRuntime;
1304 
1305   if (PP.PeelCount) {
1306     assert(UP.Count == 1 && "Cannot perform peel and unroll in the same step");
1307     LLVM_DEBUG(dbgs() << "PEELING loop %" << L->getHeader()->getName()
1308                       << " with iteration count " << PP.PeelCount << "!\n");
1309     ORE.emit([&]() {
1310       return OptimizationRemark(DEBUG_TYPE, "Peeled", L->getStartLoc(),
1311                                 L->getHeader())
1312              << " peeled loop by " << ore::NV("PeelCount", PP.PeelCount)
1313              << " iterations";
1314     });
1315 
1316     ValueToValueMapTy VMap;
1317     if (peelLoop(L, PP.PeelCount, LI, &SE, DT, &AC, PreserveLCSSA, VMap)) {
1318       simplifyLoopAfterUnroll(L, true, LI, &SE, &DT, &AC, &TTI, nullptr);
1319       // If the loop was peeled, we already "used up" the profile information
1320       // we had, so we don't want to unroll or peel again.
1321       if (PP.PeelProfiledIterations)
1322         L->setLoopAlreadyUnrolled();
1323       return LoopUnrollResult::PartiallyUnrolled;
1324     }
1325     return LoopUnrollResult::Unmodified;
1326   }
1327 
1328   // Do not attempt partial/runtime unrolling in FullLoopUnrolling
1329   if (OnlyFullUnroll && (UP.Count < TripCount || UP.Count < MaxTripCount)) {
1330     LLVM_DEBUG(
1331         dbgs() << "Not attempting partial/runtime unroll in FullLoopUnroll.\n");
1332     return LoopUnrollResult::Unmodified;
1333   }
1334 
1335   // At this point, UP.Runtime indicates that run-time unrolling is allowed.
1336   // However, we only want to actually perform it if we don't know the trip
1337   // count and the unroll count doesn't divide the known trip multiple.
1338   // TODO: This decision should probably be pushed up into
1339   // computeUnrollCount().
1340   UP.Runtime &= TripCount == 0 && TripMultiple % UP.Count != 0;
1341 
1342   // Save loop properties before it is transformed.
1343   MDNode *OrigLoopID = L->getLoopID();
1344 
1345   // Unroll the loop.
1346   Loop *RemainderLoop = nullptr;
1347   UnrollLoopOptions ULO;
1348   ULO.Count = UP.Count;
1349   ULO.Force = UP.Force;
1350   ULO.AllowExpensiveTripCount = UP.AllowExpensiveTripCount;
1351   ULO.UnrollRemainder = UP.UnrollRemainder;
1352   ULO.Runtime = UP.Runtime;
1353   ULO.ForgetAllSCEV = ForgetAllSCEV;
1354   ULO.Heart = getLoopConvergenceHeart(L);
1355   ULO.SCEVExpansionBudget = UP.SCEVExpansionBudget;
1356   ULO.RuntimeUnrollMultiExit = UP.RuntimeUnrollMultiExit;
1357   LoopUnrollResult UnrollResult = UnrollLoop(
1358       L, ULO, LI, &SE, &DT, &AC, &TTI, &ORE, PreserveLCSSA, &RemainderLoop, AA);
1359   if (UnrollResult == LoopUnrollResult::Unmodified)
1360     return LoopUnrollResult::Unmodified;
1361 
1362   if (RemainderLoop) {
1363     std::optional<MDNode *> RemainderLoopID =
1364         makeFollowupLoopID(OrigLoopID, {LLVMLoopUnrollFollowupAll,
1365                                         LLVMLoopUnrollFollowupRemainder});
1366     if (RemainderLoopID)
1367       RemainderLoop->setLoopID(*RemainderLoopID);
1368   }
1369 
1370   if (UnrollResult != LoopUnrollResult::FullyUnrolled) {
1371     std::optional<MDNode *> NewLoopID =
1372         makeFollowupLoopID(OrigLoopID, {LLVMLoopUnrollFollowupAll,
1373                                         LLVMLoopUnrollFollowupUnrolled});
1374     if (NewLoopID) {
1375       L->setLoopID(*NewLoopID);
1376 
1377       // Do not setLoopAlreadyUnrolled if loop attributes have been specified
1378       // explicitly.
1379       return UnrollResult;
1380     }
1381   }
1382 
1383   // If loop has an unroll count pragma or unrolled by explicitly set count
1384   // mark loop as unrolled to prevent unrolling beyond that requested.
1385   if (UnrollResult != LoopUnrollResult::FullyUnrolled && IsCountSetExplicitly)
1386     L->setLoopAlreadyUnrolled();
1387 
1388   return UnrollResult;
1389 }
1390 
1391 namespace {
1392 
1393 class LoopUnroll : public LoopPass {
1394 public:
1395   static char ID; // Pass ID, replacement for typeid
1396 
1397   int OptLevel;
1398 
1399   /// If false, use a cost model to determine whether unrolling of a loop is
1400   /// profitable. If true, only loops that explicitly request unrolling via
1401   /// metadata are considered. All other loops are skipped.
1402   bool OnlyWhenForced;
1403 
1404   /// If false, when SCEV is invalidated, only forget everything in the
1405   /// top-most loop (call forgetTopMostLoop), of the loop being processed.
1406   /// Otherwise, forgetAllLoops and rebuild when needed next.
1407   bool ForgetAllSCEV;
1408 
1409   std::optional<unsigned> ProvidedCount;
1410   std::optional<unsigned> ProvidedThreshold;
1411   std::optional<bool> ProvidedAllowPartial;
1412   std::optional<bool> ProvidedRuntime;
1413   std::optional<bool> ProvidedUpperBound;
1414   std::optional<bool> ProvidedAllowPeeling;
1415   std::optional<bool> ProvidedAllowProfileBasedPeeling;
1416   std::optional<unsigned> ProvidedFullUnrollMaxCount;
1417 
1418   LoopUnroll(int OptLevel = 2, bool OnlyWhenForced = false,
1419              bool ForgetAllSCEV = false,
1420              std::optional<unsigned> Threshold = std::nullopt,
1421              std::optional<unsigned> Count = std::nullopt,
1422              std::optional<bool> AllowPartial = std::nullopt,
1423              std::optional<bool> Runtime = std::nullopt,
1424              std::optional<bool> UpperBound = std::nullopt,
1425              std::optional<bool> AllowPeeling = std::nullopt,
1426              std::optional<bool> AllowProfileBasedPeeling = std::nullopt,
1427              std::optional<unsigned> ProvidedFullUnrollMaxCount = std::nullopt)
1428       : LoopPass(ID), OptLevel(OptLevel), OnlyWhenForced(OnlyWhenForced),
1429         ForgetAllSCEV(ForgetAllSCEV), ProvidedCount(std::move(Count)),
1430         ProvidedThreshold(Threshold), ProvidedAllowPartial(AllowPartial),
1431         ProvidedRuntime(Runtime), ProvidedUpperBound(UpperBound),
1432         ProvidedAllowPeeling(AllowPeeling),
1433         ProvidedAllowProfileBasedPeeling(AllowProfileBasedPeeling),
1434         ProvidedFullUnrollMaxCount(ProvidedFullUnrollMaxCount) {
1435     initializeLoopUnrollPass(*PassRegistry::getPassRegistry());
1436   }
1437 
1438   bool runOnLoop(Loop *L, LPPassManager &LPM) override {
1439     if (skipLoop(L))
1440       return false;
1441 
1442     Function &F = *L->getHeader()->getParent();
1443 
1444     auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1445     LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1446     ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1447     const TargetTransformInfo &TTI =
1448         getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1449     auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1450     // For the old PM, we can't use OptimizationRemarkEmitter as an analysis
1451     // pass.  Function analyses need to be preserved across loop transformations
1452     // but ORE cannot be preserved (see comment before the pass definition).
1453     OptimizationRemarkEmitter ORE(&F);
1454     bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
1455 
1456     LoopUnrollResult Result = tryToUnrollLoop(
1457         L, DT, LI, SE, TTI, AC, ORE, nullptr, nullptr, PreserveLCSSA, OptLevel,
1458         /*OnlyFullUnroll*/ false, OnlyWhenForced, ForgetAllSCEV, ProvidedCount,
1459         ProvidedThreshold, ProvidedAllowPartial, ProvidedRuntime,
1460         ProvidedUpperBound, ProvidedAllowPeeling,
1461         ProvidedAllowProfileBasedPeeling, ProvidedFullUnrollMaxCount);
1462 
1463     if (Result == LoopUnrollResult::FullyUnrolled)
1464       LPM.markLoopAsDeleted(*L);
1465 
1466     return Result != LoopUnrollResult::Unmodified;
1467   }
1468 
1469   /// This transformation requires natural loop information & requires that
1470   /// loop preheaders be inserted into the CFG...
1471   void getAnalysisUsage(AnalysisUsage &AU) const override {
1472     AU.addRequired<AssumptionCacheTracker>();
1473     AU.addRequired<TargetTransformInfoWrapperPass>();
1474     // FIXME: Loop passes are required to preserve domtree, and for now we just
1475     // recreate dom info if anything gets unrolled.
1476     getLoopAnalysisUsage(AU);
1477   }
1478 };
1479 
1480 } // end anonymous namespace
1481 
1482 char LoopUnroll::ID = 0;
1483 
1484 INITIALIZE_PASS_BEGIN(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
1485 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
1486 INITIALIZE_PASS_DEPENDENCY(LoopPass)
1487 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
1488 INITIALIZE_PASS_END(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
1489 
1490 Pass *llvm::createLoopUnrollPass(int OptLevel, bool OnlyWhenForced,
1491                                  bool ForgetAllSCEV, int Threshold, int Count,
1492                                  int AllowPartial, int Runtime, int UpperBound,
1493                                  int AllowPeeling) {
1494   // TODO: It would make more sense for this function to take the optionals
1495   // directly, but that's dangerous since it would silently break out of tree
1496   // callers.
1497   return new LoopUnroll(
1498       OptLevel, OnlyWhenForced, ForgetAllSCEV,
1499       Threshold == -1 ? std::nullopt : std::optional<unsigned>(Threshold),
1500       Count == -1 ? std::nullopt : std::optional<unsigned>(Count),
1501       AllowPartial == -1 ? std::nullopt : std::optional<bool>(AllowPartial),
1502       Runtime == -1 ? std::nullopt : std::optional<bool>(Runtime),
1503       UpperBound == -1 ? std::nullopt : std::optional<bool>(UpperBound),
1504       AllowPeeling == -1 ? std::nullopt : std::optional<bool>(AllowPeeling));
1505 }
1506 
1507 PreservedAnalyses LoopFullUnrollPass::run(Loop &L, LoopAnalysisManager &AM,
1508                                           LoopStandardAnalysisResults &AR,
1509                                           LPMUpdater &Updater) {
1510   // For the new PM, we can't use OptimizationRemarkEmitter as an analysis
1511   // pass. Function analyses need to be preserved across loop transformations
1512   // but ORE cannot be preserved (see comment before the pass definition).
1513   OptimizationRemarkEmitter ORE(L.getHeader()->getParent());
1514 
1515   // Keep track of the previous loop structure so we can identify new loops
1516   // created by unrolling.
1517   Loop *ParentL = L.getParentLoop();
1518   SmallPtrSet<Loop *, 4> OldLoops;
1519   if (ParentL)
1520     OldLoops.insert(ParentL->begin(), ParentL->end());
1521   else
1522     OldLoops.insert(AR.LI.begin(), AR.LI.end());
1523 
1524   std::string LoopName = std::string(L.getName());
1525 
1526   bool Changed =
1527       tryToUnrollLoop(&L, AR.DT, &AR.LI, AR.SE, AR.TTI, AR.AC, ORE,
1528                       /*BFI*/ nullptr, /*PSI*/ nullptr,
1529                       /*PreserveLCSSA*/ true, OptLevel, /*OnlyFullUnroll*/ true,
1530                       OnlyWhenForced, ForgetSCEV, /*Count*/ std::nullopt,
1531                       /*Threshold*/ std::nullopt, /*AllowPartial*/ false,
1532                       /*Runtime*/ false, /*UpperBound*/ false,
1533                       /*AllowPeeling*/ true,
1534                       /*AllowProfileBasedPeeling*/ false,
1535                       /*FullUnrollMaxCount*/ std::nullopt) !=
1536       LoopUnrollResult::Unmodified;
1537   if (!Changed)
1538     return PreservedAnalyses::all();
1539 
1540   // The parent must not be damaged by unrolling!
1541 #ifndef NDEBUG
1542   if (ParentL)
1543     ParentL->verifyLoop();
1544 #endif
1545 
1546   // Unrolling can do several things to introduce new loops into a loop nest:
1547   // - Full unrolling clones child loops within the current loop but then
1548   //   removes the current loop making all of the children appear to be new
1549   //   sibling loops.
1550   //
1551   // When a new loop appears as a sibling loop after fully unrolling,
1552   // its nesting structure has fundamentally changed and we want to revisit
1553   // it to reflect that.
1554   //
1555   // When unrolling has removed the current loop, we need to tell the
1556   // infrastructure that it is gone.
1557   //
1558   // Finally, we support a debugging/testing mode where we revisit child loops
1559   // as well. These are not expected to require further optimizations as either
1560   // they or the loop they were cloned from have been directly visited already.
1561   // But the debugging mode allows us to check this assumption.
1562   bool IsCurrentLoopValid = false;
1563   SmallVector<Loop *, 4> SibLoops;
1564   if (ParentL)
1565     SibLoops.append(ParentL->begin(), ParentL->end());
1566   else
1567     SibLoops.append(AR.LI.begin(), AR.LI.end());
1568   erase_if(SibLoops, [&](Loop *SibLoop) {
1569     if (SibLoop == &L) {
1570       IsCurrentLoopValid = true;
1571       return true;
1572     }
1573 
1574     // Otherwise erase the loop from the list if it was in the old loops.
1575     return OldLoops.contains(SibLoop);
1576   });
1577   Updater.addSiblingLoops(SibLoops);
1578 
1579   if (!IsCurrentLoopValid) {
1580     Updater.markLoopAsDeleted(L, LoopName);
1581   } else {
1582     // We can only walk child loops if the current loop remained valid.
1583     if (UnrollRevisitChildLoops) {
1584       // Walk *all* of the child loops.
1585       SmallVector<Loop *, 4> ChildLoops(L.begin(), L.end());
1586       Updater.addChildLoops(ChildLoops);
1587     }
1588   }
1589 
1590   return getLoopPassPreservedAnalyses();
1591 }
1592 
1593 PreservedAnalyses LoopUnrollPass::run(Function &F,
1594                                       FunctionAnalysisManager &AM) {
1595   auto &LI = AM.getResult<LoopAnalysis>(F);
1596   // There are no loops in the function. Return before computing other expensive
1597   // analyses.
1598   if (LI.empty())
1599     return PreservedAnalyses::all();
1600   auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
1601   auto &TTI = AM.getResult<TargetIRAnalysis>(F);
1602   auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
1603   auto &AC = AM.getResult<AssumptionAnalysis>(F);
1604   auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
1605   AAResults &AA = AM.getResult<AAManager>(F);
1606 
1607   LoopAnalysisManager *LAM = nullptr;
1608   if (auto *LAMProxy = AM.getCachedResult<LoopAnalysisManagerFunctionProxy>(F))
1609     LAM = &LAMProxy->getManager();
1610 
1611   auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
1612   ProfileSummaryInfo *PSI =
1613       MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
1614   auto *BFI = (PSI && PSI->hasProfileSummary()) ?
1615       &AM.getResult<BlockFrequencyAnalysis>(F) : nullptr;
1616 
1617   bool Changed = false;
1618 
1619   // The unroller requires loops to be in simplified form, and also needs LCSSA.
1620   // Since simplification may add new inner loops, it has to run before the
1621   // legality and profitability checks. This means running the loop unroller
1622   // will simplify all loops, regardless of whether anything end up being
1623   // unrolled.
1624   for (const auto &L : LI) {
1625     Changed |=
1626         simplifyLoop(L, &DT, &LI, &SE, &AC, nullptr, false /* PreserveLCSSA */);
1627     Changed |= formLCSSARecursively(*L, DT, &LI, &SE);
1628   }
1629 
1630   // Add the loop nests in the reverse order of LoopInfo. See method
1631   // declaration.
1632   SmallPriorityWorklist<Loop *, 4> Worklist;
1633   appendLoopsToWorklist(LI, Worklist);
1634 
1635   while (!Worklist.empty()) {
1636     // Because the LoopInfo stores the loops in RPO, we walk the worklist
1637     // from back to front so that we work forward across the CFG, which
1638     // for unrolling is only needed to get optimization remarks emitted in
1639     // a forward order.
1640     Loop &L = *Worklist.pop_back_val();
1641 #ifndef NDEBUG
1642     Loop *ParentL = L.getParentLoop();
1643 #endif
1644 
1645     // Check if the profile summary indicates that the profiled application
1646     // has a huge working set size, in which case we disable peeling to avoid
1647     // bloating it further.
1648     std::optional<bool> LocalAllowPeeling = UnrollOpts.AllowPeeling;
1649     if (PSI && PSI->hasHugeWorkingSetSize())
1650       LocalAllowPeeling = false;
1651     std::string LoopName = std::string(L.getName());
1652     // The API here is quite complex to call and we allow to select some
1653     // flavors of unrolling during construction time (by setting UnrollOpts).
1654     LoopUnrollResult Result = tryToUnrollLoop(
1655         &L, DT, &LI, SE, TTI, AC, ORE, BFI, PSI,
1656         /*PreserveLCSSA*/ true, UnrollOpts.OptLevel, /*OnlyFullUnroll*/ false,
1657         UnrollOpts.OnlyWhenForced, UnrollOpts.ForgetSCEV,
1658         /*Count*/ std::nullopt,
1659         /*Threshold*/ std::nullopt, UnrollOpts.AllowPartial,
1660         UnrollOpts.AllowRuntime, UnrollOpts.AllowUpperBound, LocalAllowPeeling,
1661         UnrollOpts.AllowProfileBasedPeeling, UnrollOpts.FullUnrollMaxCount,
1662         &AA);
1663     Changed |= Result != LoopUnrollResult::Unmodified;
1664 
1665     // The parent must not be damaged by unrolling!
1666 #ifndef NDEBUG
1667     if (Result != LoopUnrollResult::Unmodified && ParentL)
1668       ParentL->verifyLoop();
1669 #endif
1670 
1671     // Clear any cached analysis results for L if we removed it completely.
1672     if (LAM && Result == LoopUnrollResult::FullyUnrolled)
1673       LAM->clear(L, LoopName);
1674   }
1675 
1676   if (!Changed)
1677     return PreservedAnalyses::all();
1678 
1679   return getLoopPassPreservedAnalyses();
1680 }
1681 
1682 void LoopUnrollPass::printPipeline(
1683     raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
1684   static_cast<PassInfoMixin<LoopUnrollPass> *>(this)->printPipeline(
1685       OS, MapClassName2PassName);
1686   OS << '<';
1687   if (UnrollOpts.AllowPartial != std::nullopt)
1688     OS << (*UnrollOpts.AllowPartial ? "" : "no-") << "partial;";
1689   if (UnrollOpts.AllowPeeling != std::nullopt)
1690     OS << (*UnrollOpts.AllowPeeling ? "" : "no-") << "peeling;";
1691   if (UnrollOpts.AllowRuntime != std::nullopt)
1692     OS << (*UnrollOpts.AllowRuntime ? "" : "no-") << "runtime;";
1693   if (UnrollOpts.AllowUpperBound != std::nullopt)
1694     OS << (*UnrollOpts.AllowUpperBound ? "" : "no-") << "upperbound;";
1695   if (UnrollOpts.AllowProfileBasedPeeling != std::nullopt)
1696     OS << (*UnrollOpts.AllowProfileBasedPeeling ? "" : "no-")
1697        << "profile-peeling;";
1698   if (UnrollOpts.FullUnrollMaxCount != std::nullopt)
1699     OS << "full-unroll-max=" << UnrollOpts.FullUnrollMaxCount << ';';
1700   OS << 'O' << UnrollOpts.OptLevel;
1701   OS << '>';
1702 }
1703