xref: /llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (revision 6c787ff6cfb5fdf489019a1389f8315391ad435f)
1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
17 //
18 // This pass has three parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
28 // There is a development effort going on to migrate loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanAnalysis.h"
61 #include "VPlanHCFGBuilder.h"
62 #include "VPlanPatternMatch.h"
63 #include "VPlanTransforms.h"
64 #include "VPlanUtils.h"
65 #include "VPlanVerifier.h"
66 #include "llvm/ADT/APInt.h"
67 #include "llvm/ADT/ArrayRef.h"
68 #include "llvm/ADT/DenseMap.h"
69 #include "llvm/ADT/DenseMapInfo.h"
70 #include "llvm/ADT/Hashing.h"
71 #include "llvm/ADT/MapVector.h"
72 #include "llvm/ADT/STLExtras.h"
73 #include "llvm/ADT/SmallPtrSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/TypeSwitch.h"
79 #include "llvm/ADT/iterator_range.h"
80 #include "llvm/Analysis/AssumptionCache.h"
81 #include "llvm/Analysis/BasicAliasAnalysis.h"
82 #include "llvm/Analysis/BlockFrequencyInfo.h"
83 #include "llvm/Analysis/CFG.h"
84 #include "llvm/Analysis/CodeMetrics.h"
85 #include "llvm/Analysis/DemandedBits.h"
86 #include "llvm/Analysis/GlobalsModRef.h"
87 #include "llvm/Analysis/LoopAccessAnalysis.h"
88 #include "llvm/Analysis/LoopAnalysisManager.h"
89 #include "llvm/Analysis/LoopInfo.h"
90 #include "llvm/Analysis/LoopIterator.h"
91 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
92 #include "llvm/Analysis/ProfileSummaryInfo.h"
93 #include "llvm/Analysis/ScalarEvolution.h"
94 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
95 #include "llvm/Analysis/TargetLibraryInfo.h"
96 #include "llvm/Analysis/TargetTransformInfo.h"
97 #include "llvm/Analysis/ValueTracking.h"
98 #include "llvm/Analysis/VectorUtils.h"
99 #include "llvm/IR/Attributes.h"
100 #include "llvm/IR/BasicBlock.h"
101 #include "llvm/IR/CFG.h"
102 #include "llvm/IR/Constant.h"
103 #include "llvm/IR/Constants.h"
104 #include "llvm/IR/DataLayout.h"
105 #include "llvm/IR/DebugInfo.h"
106 #include "llvm/IR/DebugLoc.h"
107 #include "llvm/IR/DerivedTypes.h"
108 #include "llvm/IR/DiagnosticInfo.h"
109 #include "llvm/IR/Dominators.h"
110 #include "llvm/IR/Function.h"
111 #include "llvm/IR/IRBuilder.h"
112 #include "llvm/IR/InstrTypes.h"
113 #include "llvm/IR/Instruction.h"
114 #include "llvm/IR/Instructions.h"
115 #include "llvm/IR/IntrinsicInst.h"
116 #include "llvm/IR/Intrinsics.h"
117 #include "llvm/IR/MDBuilder.h"
118 #include "llvm/IR/Metadata.h"
119 #include "llvm/IR/Module.h"
120 #include "llvm/IR/Operator.h"
121 #include "llvm/IR/PatternMatch.h"
122 #include "llvm/IR/ProfDataUtils.h"
123 #include "llvm/IR/Type.h"
124 #include "llvm/IR/Use.h"
125 #include "llvm/IR/User.h"
126 #include "llvm/IR/Value.h"
127 #include "llvm/IR/Verifier.h"
128 #include "llvm/Support/Casting.h"
129 #include "llvm/Support/CommandLine.h"
130 #include "llvm/Support/Debug.h"
131 #include "llvm/Support/ErrorHandling.h"
132 #include "llvm/Support/InstructionCost.h"
133 #include "llvm/Support/MathExtras.h"
134 #include "llvm/Support/NativeFormatting.h"
135 #include "llvm/Support/raw_ostream.h"
136 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
137 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
138 #include "llvm/Transforms/Utils/Local.h"
139 #include "llvm/Transforms/Utils/LoopSimplify.h"
140 #include "llvm/Transforms/Utils/LoopUtils.h"
141 #include "llvm/Transforms/Utils/LoopVersioning.h"
142 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
143 #include "llvm/Transforms/Utils/SizeOpts.h"
144 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
145 #include <algorithm>
146 #include <cassert>
147 #include <cstdint>
148 #include <functional>
149 #include <iterator>
150 #include <limits>
151 #include <memory>
152 #include <string>
153 #include <tuple>
154 #include <utility>
155 
156 using namespace llvm;
157 
158 #define LV_NAME "loop-vectorize"
159 #define DEBUG_TYPE LV_NAME
160 
161 #ifndef NDEBUG
162 const char VerboseDebug[] = DEBUG_TYPE "-verbose";
163 #endif
164 
165 /// @{
166 /// Metadata attribute names
167 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
168 const char LLVMLoopVectorizeFollowupVectorized[] =
169     "llvm.loop.vectorize.followup_vectorized";
170 const char LLVMLoopVectorizeFollowupEpilogue[] =
171     "llvm.loop.vectorize.followup_epilogue";
172 /// @}
173 
174 STATISTIC(LoopsVectorized, "Number of loops vectorized");
175 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
176 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
177 
178 static cl::opt<bool> EnableEpilogueVectorization(
179     "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
180     cl::desc("Enable vectorization of epilogue loops."));
181 
182 static cl::opt<unsigned> EpilogueVectorizationForceVF(
183     "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
184     cl::desc("When epilogue vectorization is enabled, and a value greater than "
185              "1 is specified, forces the given VF for all applicable epilogue "
186              "loops."));
187 
188 static cl::opt<unsigned> EpilogueVectorizationMinVF(
189     "epilogue-vectorization-minimum-VF", cl::Hidden,
190     cl::desc("Only loops with vectorization factor equal to or larger than "
191              "the specified value are considered for epilogue vectorization."));
192 
193 /// Loops with a known constant trip count below this number are vectorized only
194 /// if no scalar iteration overheads are incurred.
195 static cl::opt<unsigned> TinyTripCountVectorThreshold(
196     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
197     cl::desc("Loops with a constant trip count that is smaller than this "
198              "value are vectorized only if no scalar iteration overheads "
199              "are incurred."));
200 
201 static cl::opt<unsigned> VectorizeMemoryCheckThreshold(
202     "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
203     cl::desc("The maximum allowed number of runtime memory checks"));
204 
205 // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
206 // that predication is preferred, and this lists all options. I.e., the
207 // vectorizer will try to fold the tail-loop (epilogue) into the vector body
208 // and predicate the instructions accordingly. If tail-folding fails, there are
209 // different fallback strategies depending on these values:
210 namespace PreferPredicateTy {
211   enum Option {
212     ScalarEpilogue = 0,
213     PredicateElseScalarEpilogue,
214     PredicateOrDontVectorize
215   };
216 } // namespace PreferPredicateTy
217 
218 static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
219     "prefer-predicate-over-epilogue",
220     cl::init(PreferPredicateTy::ScalarEpilogue),
221     cl::Hidden,
222     cl::desc("Tail-folding and predication preferences over creating a scalar "
223              "epilogue loop."),
224     cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
225                          "scalar-epilogue",
226                          "Don't tail-predicate loops, create scalar epilogue"),
227               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
228                          "predicate-else-scalar-epilogue",
229                          "prefer tail-folding, create scalar epilogue if tail "
230                          "folding fails."),
231               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
232                          "predicate-dont-vectorize",
233                          "prefers tail-folding, don't attempt vectorization if "
234                          "tail-folding fails.")));
235 
236 static cl::opt<TailFoldingStyle> ForceTailFoldingStyle(
237     "force-tail-folding-style", cl::desc("Force the tail folding style"),
238     cl::init(TailFoldingStyle::None),
239     cl::values(
240         clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"),
241         clEnumValN(
242             TailFoldingStyle::Data, "data",
243             "Create lane mask for data only, using active.lane.mask intrinsic"),
244         clEnumValN(TailFoldingStyle::DataWithoutLaneMask,
245                    "data-without-lane-mask",
246                    "Create lane mask with compare/stepvector"),
247         clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control",
248                    "Create lane mask using active.lane.mask intrinsic, and use "
249                    "it for both data and control flow"),
250         clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
251                    "data-and-control-without-rt-check",
252                    "Similar to data-and-control, but remove the runtime check"),
253         clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl",
254                    "Use predicated EVL instructions for tail folding. If EVL "
255                    "is unsupported, fallback to data-without-lane-mask.")));
256 
257 static cl::opt<bool> MaximizeBandwidth(
258     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
259     cl::desc("Maximize bandwidth when selecting vectorization factor which "
260              "will be determined by the smallest type in loop."));
261 
262 static cl::opt<bool> EnableInterleavedMemAccesses(
263     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
264     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
265 
266 /// An interleave-group may need masking if it resides in a block that needs
267 /// predication, or in order to mask away gaps.
268 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
269     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
270     cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
271 
272 static cl::opt<unsigned> ForceTargetNumScalarRegs(
273     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
274     cl::desc("A flag that overrides the target's number of scalar registers."));
275 
276 static cl::opt<unsigned> ForceTargetNumVectorRegs(
277     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
278     cl::desc("A flag that overrides the target's number of vector registers."));
279 
280 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
281     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
282     cl::desc("A flag that overrides the target's max interleave factor for "
283              "scalar loops."));
284 
285 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
286     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
287     cl::desc("A flag that overrides the target's max interleave factor for "
288              "vectorized loops."));
289 
290 cl::opt<unsigned> ForceTargetInstructionCost(
291     "force-target-instruction-cost", cl::init(0), cl::Hidden,
292     cl::desc("A flag that overrides the target's expected cost for "
293              "an instruction to a single constant value. Mostly "
294              "useful for getting consistent testing."));
295 
296 static cl::opt<bool> ForceTargetSupportsScalableVectors(
297     "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
298     cl::desc(
299         "Pretend that scalable vectors are supported, even if the target does "
300         "not support them. This flag should only be used for testing."));
301 
302 static cl::opt<unsigned> SmallLoopCost(
303     "small-loop-cost", cl::init(20), cl::Hidden,
304     cl::desc(
305         "The cost of a loop that is considered 'small' by the interleaver."));
306 
307 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
308     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
309     cl::desc("Enable the use of the block frequency analysis to access PGO "
310              "heuristics minimizing code growth in cold regions and being more "
311              "aggressive in hot regions."));
312 
313 // Runtime interleave loops for load/store throughput.
314 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
315     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
316     cl::desc(
317         "Enable runtime interleaving until load/store ports are saturated"));
318 
319 /// The number of stores in a loop that are allowed to need predication.
320 static cl::opt<unsigned> NumberOfStoresToPredicate(
321     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
322     cl::desc("Max number of stores to be predicated behind an if."));
323 
324 static cl::opt<bool> EnableIndVarRegisterHeur(
325     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
326     cl::desc("Count the induction variable only once when interleaving"));
327 
328 static cl::opt<bool> EnableCondStoresVectorization(
329     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
330     cl::desc("Enable if predication of stores during vectorization."));
331 
332 static cl::opt<unsigned> MaxNestedScalarReductionIC(
333     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
334     cl::desc("The maximum interleave count to use when interleaving a scalar "
335              "reduction in a nested loop."));
336 
337 static cl::opt<bool>
338     PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
339                            cl::Hidden,
340                            cl::desc("Prefer in-loop vector reductions, "
341                                     "overriding the targets preference."));
342 
343 static cl::opt<bool> ForceOrderedReductions(
344     "force-ordered-reductions", cl::init(false), cl::Hidden,
345     cl::desc("Enable the vectorisation of loops with in-order (strict) "
346              "FP reductions"));
347 
348 static cl::opt<bool> PreferPredicatedReductionSelect(
349     "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
350     cl::desc(
351         "Prefer predicating a reduction operation over an after loop select."));
352 
353 namespace llvm {
354 cl::opt<bool> EnableVPlanNativePath(
355     "enable-vplan-native-path", cl::Hidden,
356     cl::desc("Enable VPlan-native vectorization path with "
357              "support for outer loop vectorization."));
358 } // namespace llvm
359 
360 // This flag enables the stress testing of the VPlan H-CFG construction in the
361 // VPlan-native vectorization path. It must be used in conjuction with
362 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
363 // verification of the H-CFGs built.
364 static cl::opt<bool> VPlanBuildStressTest(
365     "vplan-build-stress-test", cl::init(false), cl::Hidden,
366     cl::desc(
367         "Build VPlan for every supported loop nest in the function and bail "
368         "out right after the build (stress test the VPlan H-CFG construction "
369         "in the VPlan-native vectorization path)."));
370 
371 cl::opt<bool> llvm::EnableLoopInterleaving(
372     "interleave-loops", cl::init(true), cl::Hidden,
373     cl::desc("Enable loop interleaving in Loop vectorization passes"));
374 cl::opt<bool> llvm::EnableLoopVectorization(
375     "vectorize-loops", cl::init(true), cl::Hidden,
376     cl::desc("Run the Loop vectorization passes"));
377 
378 static cl::opt<cl::boolOrDefault> ForceSafeDivisor(
379     "force-widen-divrem-via-safe-divisor", cl::Hidden,
380     cl::desc(
381         "Override cost based safe divisor widening for div/rem instructions"));
382 
383 static cl::opt<bool> UseWiderVFIfCallVariantsPresent(
384     "vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true),
385     cl::Hidden,
386     cl::desc("Try wider VFs if they enable the use of vector variants"));
387 
388 static cl::opt<bool> EnableEarlyExitVectorization(
389     "enable-early-exit-vectorization", cl::init(false), cl::Hidden,
390     cl::desc(
391         "Enable vectorization of early exit loops with uncountable exits."));
392 
393 // Likelyhood of bypassing the vectorized loop because assumptions about SCEV
394 // variables not overflowing do not hold. See `emitSCEVChecks`.
395 static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127};
396 // Likelyhood of bypassing the vectorized loop because pointers overlap. See
397 // `emitMemRuntimeChecks`.
398 static constexpr uint32_t MemCheckBypassWeights[] = {1, 127};
399 // Likelyhood of bypassing the vectorized loop because there are zero trips left
400 // after prolog. See `emitIterationCountCheck`.
401 static constexpr uint32_t MinItersBypassWeights[] = {1, 127};
402 
403 /// A helper function that returns true if the given type is irregular. The
404 /// type is irregular if its allocated size doesn't equal the store size of an
405 /// element of the corresponding vector type.
406 static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
407   // Determine if an array of N elements of type Ty is "bitcast compatible"
408   // with a <N x Ty> vector.
409   // This is only true if there is no padding between the array elements.
410   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
411 }
412 
413 /// Returns "best known" trip count for the specified loop \p L as defined by
414 /// the following procedure:
415 ///   1) Returns exact trip count if it is known.
416 ///   2) Returns expected trip count according to profile data if any.
417 ///   3) Returns upper bound estimate if known, and if \p CanUseConstantMax.
418 ///   4) Returns std::nullopt if all of the above failed.
419 static std::optional<unsigned>
420 getSmallBestKnownTC(PredicatedScalarEvolution &PSE, Loop *L,
421                     bool CanUseConstantMax = true) {
422   // Check if exact trip count is known.
423   if (unsigned ExpectedTC = PSE.getSE()->getSmallConstantTripCount(L))
424     return ExpectedTC;
425 
426   // Check if there is an expected trip count available from profile data.
427   if (LoopVectorizeWithBlockFrequency)
428     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
429       return *EstimatedTC;
430 
431   if (!CanUseConstantMax)
432     return std::nullopt;
433 
434   // Check if upper bound estimate is known.
435   if (unsigned ExpectedTC = PSE.getSmallConstantMaxTripCount())
436     return ExpectedTC;
437 
438   return std::nullopt;
439 }
440 
441 namespace {
442 // Forward declare GeneratedRTChecks.
443 class GeneratedRTChecks;
444 
445 using SCEV2ValueTy = DenseMap<const SCEV *, Value *>;
446 } // namespace
447 
448 namespace llvm {
449 
450 AnalysisKey ShouldRunExtraVectorPasses::Key;
451 
452 /// InnerLoopVectorizer vectorizes loops which contain only one basic
453 /// block to a specified vectorization factor (VF).
454 /// This class performs the widening of scalars into vectors, or multiple
455 /// scalars. This class also implements the following features:
456 /// * It inserts an epilogue loop for handling loops that don't have iteration
457 ///   counts that are known to be a multiple of the vectorization factor.
458 /// * It handles the code generation for reduction variables.
459 /// * Scalarization (implementation using scalars) of un-vectorizable
460 ///   instructions.
461 /// InnerLoopVectorizer does not perform any vectorization-legality
462 /// checks, and relies on the caller to check for the different legality
463 /// aspects. The InnerLoopVectorizer relies on the
464 /// LoopVectorizationLegality class to provide information about the induction
465 /// and reduction variables that were found to a given vectorization factor.
466 class InnerLoopVectorizer {
467 public:
468   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
469                       LoopInfo *LI, DominatorTree *DT,
470                       const TargetLibraryInfo *TLI,
471                       const TargetTransformInfo *TTI, AssumptionCache *AC,
472                       OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
473                       ElementCount MinProfitableTripCount,
474                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
475                       LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
476                       ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks,
477                       VPlan &Plan)
478       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
479         AC(AC), ORE(ORE), VF(VecWidth),
480         MinProfitableTripCount(MinProfitableTripCount), UF(UnrollFactor),
481         Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
482         PSI(PSI), RTChecks(RTChecks), Plan(Plan),
483         VectorPHVPB(Plan.getEntry()->getSingleSuccessor()) {
484     // Query this against the original loop and save it here because the profile
485     // of the original loop header may change as the transformation happens.
486     OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
487         OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
488   }
489 
490   virtual ~InnerLoopVectorizer() = default;
491 
492   /// Create a new empty loop that will contain vectorized instructions later
493   /// on, while the old loop will be used as the scalar remainder. Control flow
494   /// is generated around the vectorized (and scalar epilogue) loops consisting
495   /// of various checks and bypasses. Return the pre-header block of the new
496   /// loop. In the case of epilogue vectorization, this function is overriden to
497   /// handle the more complex control flow around the loops. \p ExpandedSCEVs is
498   /// used to look up SCEV expansions for expressions needed during skeleton
499   /// creation.
500   virtual BasicBlock *
501   createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs);
502 
503   /// Fix the vectorized code, taking care of header phi's, and more.
504   void fixVectorizedLoop(VPTransformState &State);
505 
506   // Return true if any runtime check is added.
507   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
508 
509   /// A helper function to scalarize a single Instruction in the innermost loop.
510   /// Generates a sequence of scalar instances for each lane between \p MinLane
511   /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
512   /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p
513   /// Instr's operands.
514   void scalarizeInstruction(const Instruction *Instr,
515                             VPReplicateRecipe *RepRecipe, const VPLane &Lane,
516                             VPTransformState &State);
517 
518   /// Fix the non-induction PHIs in \p Plan.
519   void fixNonInductionPHIs(VPTransformState &State);
520 
521   /// Returns the original loop trip count.
522   Value *getTripCount() const { return TripCount; }
523 
524   /// Used to set the trip count after ILV's construction and after the
525   /// preheader block has been executed. Note that this always holds the trip
526   /// count of the original loop for both main loop and epilogue vectorization.
527   void setTripCount(Value *TC) { TripCount = TC; }
528 
529   // Retrieve the additional bypass value associated with an original
530   /// induction header phi.
531   Value *getInductionAdditionalBypassValue(PHINode *OrigPhi) const {
532     return Induction2AdditionalBypassValue.at(OrigPhi);
533   }
534 
535   /// Return the additional bypass block which targets the scalar loop by
536   /// skipping the epilogue loop after completing the main loop.
537   BasicBlock *getAdditionalBypassBlock() const {
538     assert(AdditionalBypassBlock &&
539            "Trying to access AdditionalBypassBlock but it has not been set");
540     return AdditionalBypassBlock;
541   }
542 
543 protected:
544   friend class LoopVectorizationPlanner;
545 
546   /// Iteratively sink the scalarized operands of a predicated instruction into
547   /// the block that was created for it.
548   void sinkScalarOperands(Instruction *PredInst);
549 
550   /// Returns (and creates if needed) the trip count of the widened loop.
551   Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);
552 
553   /// Emit a bypass check to see if the vector trip count is zero, including if
554   /// it overflows.
555   void emitIterationCountCheck(BasicBlock *Bypass);
556 
557   /// Emit a bypass check to see if all of the SCEV assumptions we've
558   /// had to make are correct. Returns the block containing the checks or
559   /// nullptr if no checks have been added.
560   BasicBlock *emitSCEVChecks(BasicBlock *Bypass);
561 
562   /// Emit bypass checks to check any memory assumptions we may have made.
563   /// Returns the block containing the checks or nullptr if no checks have been
564   /// added.
565   BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass);
566 
567   /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
568   /// vector loop preheader, middle block and scalar preheader.
569   void createVectorLoopSkeleton(StringRef Prefix);
570 
571   /// Create and record the values for induction variables to resume coming from
572   /// the additional bypass block.
573   void createInductionAdditionalBypassValues(const SCEV2ValueTy &ExpandedSCEVs,
574                                              Value *MainVectorTripCount);
575 
576   /// Allow subclasses to override and print debug traces before/after vplan
577   /// execution, when trace information is requested.
578   virtual void printDebugTracesAtStart() {}
579   virtual void printDebugTracesAtEnd() {}
580 
581   /// Introduces a new VPIRBasicBlock for \p CheckIRBB to Plan between the
582   /// vector preheader and its predecessor, also connecting the new block to the
583   /// scalar preheader.
584   void introduceCheckBlockInVPlan(BasicBlock *CheckIRBB);
585 
586   /// The original loop.
587   Loop *OrigLoop;
588 
589   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
590   /// dynamic knowledge to simplify SCEV expressions and converts them to a
591   /// more usable form.
592   PredicatedScalarEvolution &PSE;
593 
594   /// Loop Info.
595   LoopInfo *LI;
596 
597   /// Dominator Tree.
598   DominatorTree *DT;
599 
600   /// Target Library Info.
601   const TargetLibraryInfo *TLI;
602 
603   /// Target Transform Info.
604   const TargetTransformInfo *TTI;
605 
606   /// Assumption Cache.
607   AssumptionCache *AC;
608 
609   /// Interface to emit optimization remarks.
610   OptimizationRemarkEmitter *ORE;
611 
612   /// The vectorization SIMD factor to use. Each vector will have this many
613   /// vector elements.
614   ElementCount VF;
615 
616   ElementCount MinProfitableTripCount;
617 
618   /// The vectorization unroll factor to use. Each scalar is vectorized to this
619   /// many different vector instructions.
620   unsigned UF;
621 
622   /// The builder that we use
623   IRBuilder<> Builder;
624 
625   // --- Vectorization state ---
626 
627   /// The vector-loop preheader.
628   BasicBlock *LoopVectorPreHeader;
629 
630   /// The scalar-loop preheader.
631   BasicBlock *LoopScalarPreHeader;
632 
633   /// Middle Block between the vector and the scalar.
634   BasicBlock *LoopMiddleBlock;
635 
636   /// A list of all bypass blocks. The first block is the entry of the loop.
637   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
638 
639   /// Store instructions that were predicated.
640   SmallVector<Instruction *, 4> PredicatedInstructions;
641 
642   /// Trip count of the original loop.
643   Value *TripCount = nullptr;
644 
645   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
646   Value *VectorTripCount = nullptr;
647 
648   /// The legality analysis.
649   LoopVectorizationLegality *Legal;
650 
651   /// The profitablity analysis.
652   LoopVectorizationCostModel *Cost;
653 
654   // Record whether runtime checks are added.
655   bool AddedSafetyChecks = false;
656 
657   /// BFI and PSI are used to check for profile guided size optimizations.
658   BlockFrequencyInfo *BFI;
659   ProfileSummaryInfo *PSI;
660 
661   // Whether this loop should be optimized for size based on profile guided size
662   // optimizatios.
663   bool OptForSizeBasedOnProfile;
664 
665   /// Structure to hold information about generated runtime checks, responsible
666   /// for cleaning the checks, if vectorization turns out unprofitable.
667   GeneratedRTChecks &RTChecks;
668 
669   /// Mapping of induction phis to their additional bypass values. They
670   /// need to be added as operands to phi nodes in the scalar loop preheader
671   /// after the epilogue skeleton has been created.
672   DenseMap<PHINode *, Value *> Induction2AdditionalBypassValue;
673 
674   /// The additional bypass block which conditionally skips over the epilogue
675   /// loop after executing the main loop. Needed to resume inductions and
676   /// reductions during epilogue vectorization.
677   BasicBlock *AdditionalBypassBlock = nullptr;
678 
679   VPlan &Plan;
680 
681   /// The vector preheader block of \p Plan, used as target for check blocks
682   /// introduced during skeleton creation.
683   VPBlockBase *VectorPHVPB;
684 };
685 
686 /// Encapsulate information regarding vectorization of a loop and its epilogue.
687 /// This information is meant to be updated and used across two stages of
688 /// epilogue vectorization.
689 struct EpilogueLoopVectorizationInfo {
690   ElementCount MainLoopVF = ElementCount::getFixed(0);
691   unsigned MainLoopUF = 0;
692   ElementCount EpilogueVF = ElementCount::getFixed(0);
693   unsigned EpilogueUF = 0;
694   BasicBlock *MainLoopIterationCountCheck = nullptr;
695   BasicBlock *EpilogueIterationCountCheck = nullptr;
696   BasicBlock *SCEVSafetyCheck = nullptr;
697   BasicBlock *MemSafetyCheck = nullptr;
698   Value *TripCount = nullptr;
699   Value *VectorTripCount = nullptr;
700   VPlan &EpiloguePlan;
701 
702   EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
703                                 ElementCount EVF, unsigned EUF,
704                                 VPlan &EpiloguePlan)
705       : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF),
706         EpiloguePlan(EpiloguePlan) {
707     assert(EUF == 1 &&
708            "A high UF for the epilogue loop is likely not beneficial.");
709   }
710 };
711 
712 /// An extension of the inner loop vectorizer that creates a skeleton for a
713 /// vectorized loop that has its epilogue (residual) also vectorized.
714 /// The idea is to run the vplan on a given loop twice, firstly to setup the
715 /// skeleton and vectorize the main loop, and secondly to complete the skeleton
716 /// from the first step and vectorize the epilogue.  This is achieved by
717 /// deriving two concrete strategy classes from this base class and invoking
718 /// them in succession from the loop vectorizer planner.
719 class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
720 public:
721   InnerLoopAndEpilogueVectorizer(
722       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
723       DominatorTree *DT, const TargetLibraryInfo *TLI,
724       const TargetTransformInfo *TTI, AssumptionCache *AC,
725       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
726       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
727       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
728       GeneratedRTChecks &Checks, VPlan &Plan)
729       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
730                             EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
731                             CM, BFI, PSI, Checks, Plan),
732         EPI(EPI) {}
733 
734   // Override this function to handle the more complex control flow around the
735   // three loops.
736   BasicBlock *
737   createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final {
738     return createEpilogueVectorizedLoopSkeleton(ExpandedSCEVs);
739   }
740 
741   /// The interface for creating a vectorized skeleton using one of two
742   /// different strategies, each corresponding to one execution of the vplan
743   /// as described above.
744   virtual BasicBlock *
745   createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) = 0;
746 
747   /// Holds and updates state information required to vectorize the main loop
748   /// and its epilogue in two separate passes. This setup helps us avoid
749   /// regenerating and recomputing runtime safety checks. It also helps us to
750   /// shorten the iteration-count-check path length for the cases where the
751   /// iteration count of the loop is so small that the main vector loop is
752   /// completely skipped.
753   EpilogueLoopVectorizationInfo &EPI;
754 };
755 
756 /// A specialized derived class of inner loop vectorizer that performs
757 /// vectorization of *main* loops in the process of vectorizing loops and their
758 /// epilogues.
759 class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
760 public:
761   EpilogueVectorizerMainLoop(
762       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
763       DominatorTree *DT, const TargetLibraryInfo *TLI,
764       const TargetTransformInfo *TTI, AssumptionCache *AC,
765       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
766       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
767       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
768       GeneratedRTChecks &Check, VPlan &Plan)
769       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
770                                        EPI, LVL, CM, BFI, PSI, Check, Plan) {}
771   /// Implements the interface for creating a vectorized skeleton using the
772   /// *main loop* strategy (ie the first pass of vplan execution).
773   BasicBlock *
774   createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
775 
776 protected:
777   /// Emits an iteration count bypass check once for the main loop (when \p
778   /// ForEpilogue is false) and once for the epilogue loop (when \p
779   /// ForEpilogue is true).
780   BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
781   void printDebugTracesAtStart() override;
782   void printDebugTracesAtEnd() override;
783 };
784 
785 // A specialized derived class of inner loop vectorizer that performs
786 // vectorization of *epilogue* loops in the process of vectorizing loops and
787 // their epilogues.
788 class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
789 public:
790   EpilogueVectorizerEpilogueLoop(
791       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
792       DominatorTree *DT, const TargetLibraryInfo *TLI,
793       const TargetTransformInfo *TTI, AssumptionCache *AC,
794       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
795       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
796       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
797       GeneratedRTChecks &Checks, VPlan &Plan)
798       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
799                                        EPI, LVL, CM, BFI, PSI, Checks, Plan) {
800     TripCount = EPI.TripCount;
801   }
802   /// Implements the interface for creating a vectorized skeleton using the
803   /// *epilogue loop* strategy (ie the second pass of vplan execution).
804   BasicBlock *
805   createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
806 
807 protected:
808   /// Emits an iteration count bypass check after the main vector loop has
809   /// finished to see if there are any iterations left to execute by either
810   /// the vector epilogue or the scalar epilogue.
811   BasicBlock *emitMinimumVectorEpilogueIterCountCheck(
812                                                       BasicBlock *Bypass,
813                                                       BasicBlock *Insert);
814   void printDebugTracesAtStart() override;
815   void printDebugTracesAtEnd() override;
816 };
817 } // end namespace llvm
818 
819 /// Look for a meaningful debug location on the instruction or its operands.
820 static DebugLoc getDebugLocFromInstOrOperands(Instruction *I) {
821   if (!I)
822     return DebugLoc();
823 
824   DebugLoc Empty;
825   if (I->getDebugLoc() != Empty)
826     return I->getDebugLoc();
827 
828   for (Use &Op : I->operands()) {
829     if (Instruction *OpInst = dyn_cast<Instruction>(Op))
830       if (OpInst->getDebugLoc() != Empty)
831         return OpInst->getDebugLoc();
832   }
833 
834   return I->getDebugLoc();
835 }
836 
837 /// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
838 /// is passed, the message relates to that particular instruction.
839 #ifndef NDEBUG
840 static void debugVectorizationMessage(const StringRef Prefix,
841                                       const StringRef DebugMsg,
842                                       Instruction *I) {
843   dbgs() << "LV: " << Prefix << DebugMsg;
844   if (I != nullptr)
845     dbgs() << " " << *I;
846   else
847     dbgs() << '.';
848   dbgs() << '\n';
849 }
850 #endif
851 
852 /// Create an analysis remark that explains why vectorization failed
853 ///
854 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
855 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
856 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
857 /// the location of the remark. If \p DL is passed, use it as debug location for
858 /// the remark. \return the remark object that can be streamed to.
859 static OptimizationRemarkAnalysis
860 createLVAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop,
861                  Instruction *I, DebugLoc DL = {}) {
862   Value *CodeRegion = I ? I->getParent() : TheLoop->getHeader();
863   // If debug location is attached to the instruction, use it. Otherwise if DL
864   // was not provided, use the loop's.
865   if (I && I->getDebugLoc())
866     DL = I->getDebugLoc();
867   else if (!DL)
868     DL = TheLoop->getStartLoc();
869 
870   return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
871 }
872 
873 namespace llvm {
874 
875 /// Return a value for Step multiplied by VF.
876 Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
877                        int64_t Step) {
878   assert(Ty->isIntegerTy() && "Expected an integer step");
879   return B.CreateElementCount(Ty, VF.multiplyCoefficientBy(Step));
880 }
881 
882 /// Return the runtime value for VF.
883 Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
884   return B.CreateElementCount(Ty, VF);
885 }
886 
887 void reportVectorizationFailure(const StringRef DebugMsg,
888                                 const StringRef OREMsg, const StringRef ORETag,
889                                 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
890                                 Instruction *I) {
891   LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
892   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
893   ORE->emit(
894       createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
895       << "loop not vectorized: " << OREMsg);
896 }
897 
898 /// Reports an informative message: print \p Msg for debugging purposes as well
899 /// as an optimization remark. Uses either \p I as location of the remark, or
900 /// otherwise \p TheLoop. If \p DL is passed, use it as debug location for the
901 /// remark. If \p DL is passed, use it as debug location for the remark.
902 static void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
903                                     OptimizationRemarkEmitter *ORE,
904                                     Loop *TheLoop, Instruction *I = nullptr,
905                                     DebugLoc DL = {}) {
906   LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
907   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
908   ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop,
909                              I, DL)
910             << Msg);
911 }
912 
913 /// Report successful vectorization of the loop. In case an outer loop is
914 /// vectorized, prepend "outer" to the vectorization remark.
915 static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop,
916                                 VectorizationFactor VF, unsigned IC) {
917   LLVM_DEBUG(debugVectorizationMessage(
918       "Vectorizing: ", TheLoop->isInnermost() ? "innermost loop" : "outer loop",
919       nullptr));
920   StringRef LoopType = TheLoop->isInnermost() ? "" : "outer ";
921   ORE->emit([&]() {
922     return OptimizationRemark(LV_NAME, "Vectorized", TheLoop->getStartLoc(),
923                               TheLoop->getHeader())
924            << "vectorized " << LoopType << "loop (vectorization width: "
925            << ore::NV("VectorizationFactor", VF.Width)
926            << ", interleaved count: " << ore::NV("InterleaveCount", IC) << ")";
927   });
928 }
929 
930 } // end namespace llvm
931 
932 namespace llvm {
933 
934 // Loop vectorization cost-model hints how the scalar epilogue loop should be
935 // lowered.
936 enum ScalarEpilogueLowering {
937 
938   // The default: allowing scalar epilogues.
939   CM_ScalarEpilogueAllowed,
940 
941   // Vectorization with OptForSize: don't allow epilogues.
942   CM_ScalarEpilogueNotAllowedOptSize,
943 
944   // A special case of vectorisation with OptForSize: loops with a very small
945   // trip count are considered for vectorization under OptForSize, thereby
946   // making sure the cost of their loop body is dominant, free of runtime
947   // guards and scalar iteration overheads.
948   CM_ScalarEpilogueNotAllowedLowTripLoop,
949 
950   // Loop hint predicate indicating an epilogue is undesired.
951   CM_ScalarEpilogueNotNeededUsePredicate,
952 
953   // Directive indicating we must either tail fold or not vectorize
954   CM_ScalarEpilogueNotAllowedUsePredicate
955 };
956 
957 using InstructionVFPair = std::pair<Instruction *, ElementCount>;
958 
959 /// LoopVectorizationCostModel - estimates the expected speedups due to
960 /// vectorization.
961 /// In many cases vectorization is not profitable. This can happen because of
962 /// a number of reasons. In this class we mainly attempt to predict the
963 /// expected speedup/slowdowns due to the supported instruction set. We use the
964 /// TargetTransformInfo to query the different backends for the cost of
965 /// different operations.
966 class LoopVectorizationCostModel {
967   friend class LoopVectorizationPlanner;
968 
969 public:
970   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
971                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
972                              LoopVectorizationLegality *Legal,
973                              const TargetTransformInfo &TTI,
974                              const TargetLibraryInfo *TLI, DemandedBits *DB,
975                              AssumptionCache *AC,
976                              OptimizationRemarkEmitter *ORE, const Function *F,
977                              const LoopVectorizeHints *Hints,
978                              InterleavedAccessInfo &IAI)
979       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
980         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
981         Hints(Hints), InterleaveInfo(IAI), CostKind(TTI::TCK_RecipThroughput) {}
982 
983   /// \return An upper bound for the vectorization factors (both fixed and
984   /// scalable). If the factors are 0, vectorization and interleaving should be
985   /// avoided up front.
986   FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
987 
988   /// \return True if runtime checks are required for vectorization, and false
989   /// otherwise.
990   bool runtimeChecksRequired();
991 
992   /// Setup cost-based decisions for user vectorization factor.
993   /// \return true if the UserVF is a feasible VF to be chosen.
994   bool selectUserVectorizationFactor(ElementCount UserVF) {
995     collectUniformsAndScalars(UserVF);
996     collectInstsToScalarize(UserVF);
997     return expectedCost(UserVF).isValid();
998   }
999 
1000   /// \return The size (in bits) of the smallest and widest types in the code
1001   /// that needs to be vectorized. We ignore values that remain scalar such as
1002   /// 64 bit loop indices.
1003   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1004 
1005   /// \return The desired interleave count.
1006   /// If interleave count has been specified by metadata it will be returned.
1007   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1008   /// are the selected vectorization factor and the cost of the selected VF.
1009   unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost);
1010 
1011   /// Memory access instruction may be vectorized in more than one way.
1012   /// Form of instruction after vectorization depends on cost.
1013   /// This function takes cost-based decisions for Load/Store instructions
1014   /// and collects them in a map. This decisions map is used for building
1015   /// the lists of loop-uniform and loop-scalar instructions.
1016   /// The calculated cost is saved with widening decision in order to
1017   /// avoid redundant calculations.
1018   void setCostBasedWideningDecision(ElementCount VF);
1019 
1020   /// A call may be vectorized in different ways depending on whether we have
1021   /// vectorized variants available and whether the target supports masking.
1022   /// This function analyzes all calls in the function at the supplied VF,
1023   /// makes a decision based on the costs of available options, and stores that
1024   /// decision in a map for use in planning and plan execution.
1025   void setVectorizedCallDecision(ElementCount VF);
1026 
1027   /// A struct that represents some properties of the register usage
1028   /// of a loop.
1029   struct RegisterUsage {
1030     /// Holds the number of loop invariant values that are used in the loop.
1031     /// The key is ClassID of target-provided register class.
1032     SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1033     /// Holds the maximum number of concurrent live intervals in the loop.
1034     /// The key is ClassID of target-provided register class.
1035     SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1036   };
1037 
1038   /// \return Returns information about the register usages of the loop for the
1039   /// given vectorization factors.
1040   SmallVector<RegisterUsage, 8>
1041   calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1042 
1043   /// Collect values we want to ignore in the cost model.
1044   void collectValuesToIgnore();
1045 
1046   /// Collect all element types in the loop for which widening is needed.
1047   void collectElementTypesForWidening();
1048 
1049   /// Split reductions into those that happen in the loop, and those that happen
1050   /// outside. In loop reductions are collected into InLoopReductions.
1051   void collectInLoopReductions();
1052 
1053   /// Returns true if we should use strict in-order reductions for the given
1054   /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
1055   /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
1056   /// of FP operations.
1057   bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
1058     return !Hints->allowReordering() && RdxDesc.isOrdered();
1059   }
1060 
1061   /// \returns The smallest bitwidth each instruction can be represented with.
1062   /// The vector equivalents of these instructions should be truncated to this
1063   /// type.
1064   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1065     return MinBWs;
1066   }
1067 
1068   /// \returns True if it is more profitable to scalarize instruction \p I for
1069   /// vectorization factor \p VF.
1070   bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1071     assert(VF.isVector() &&
1072            "Profitable to scalarize relevant only for VF > 1.");
1073     assert(
1074         TheLoop->isInnermost() &&
1075         "cost-model should not be used for outer loops (in VPlan-native path)");
1076 
1077     auto Scalars = InstsToScalarize.find(VF);
1078     assert(Scalars != InstsToScalarize.end() &&
1079            "VF not yet analyzed for scalarization profitability");
1080     return Scalars->second.contains(I);
1081   }
1082 
1083   /// Returns true if \p I is known to be uniform after vectorization.
1084   bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1085     assert(
1086         TheLoop->isInnermost() &&
1087         "cost-model should not be used for outer loops (in VPlan-native path)");
1088     // Pseudo probe needs to be duplicated for each unrolled iteration and
1089     // vector lane so that profiled loop trip count can be accurately
1090     // accumulated instead of being under counted.
1091     if (isa<PseudoProbeInst>(I))
1092       return false;
1093 
1094     if (VF.isScalar())
1095       return true;
1096 
1097     auto UniformsPerVF = Uniforms.find(VF);
1098     assert(UniformsPerVF != Uniforms.end() &&
1099            "VF not yet analyzed for uniformity");
1100     return UniformsPerVF->second.count(I);
1101   }
1102 
1103   /// Returns true if \p I is known to be scalar after vectorization.
1104   bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1105     assert(
1106         TheLoop->isInnermost() &&
1107         "cost-model should not be used for outer loops (in VPlan-native path)");
1108     if (VF.isScalar())
1109       return true;
1110 
1111     auto ScalarsPerVF = Scalars.find(VF);
1112     assert(ScalarsPerVF != Scalars.end() &&
1113            "Scalar values are not calculated for VF");
1114     return ScalarsPerVF->second.count(I);
1115   }
1116 
1117   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1118   /// for vectorization factor \p VF.
1119   bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1120     return VF.isVector() && MinBWs.contains(I) &&
1121            !isProfitableToScalarize(I, VF) &&
1122            !isScalarAfterVectorization(I, VF);
1123   }
1124 
1125   /// Decision that was taken during cost calculation for memory instruction.
1126   enum InstWidening {
1127     CM_Unknown,
1128     CM_Widen,         // For consecutive accesses with stride +1.
1129     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1130     CM_Interleave,
1131     CM_GatherScatter,
1132     CM_Scalarize,
1133     CM_VectorCall,
1134     CM_IntrinsicCall
1135   };
1136 
1137   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1138   /// instruction \p I and vector width \p VF.
1139   void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1140                            InstructionCost Cost) {
1141     assert(VF.isVector() && "Expected VF >=2");
1142     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1143   }
1144 
1145   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1146   /// interleaving group \p Grp and vector width \p VF.
1147   void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1148                            ElementCount VF, InstWidening W,
1149                            InstructionCost Cost) {
1150     assert(VF.isVector() && "Expected VF >=2");
1151     /// Broadcast this decicion to all instructions inside the group.
1152     /// When interleaving, the cost will only be assigned one instruction, the
1153     /// insert position. For other cases, add the appropriate fraction of the
1154     /// total cost to each instruction. This ensures accurate costs are used,
1155     /// even if the insert position instruction is not used.
1156     InstructionCost InsertPosCost = Cost;
1157     InstructionCost OtherMemberCost = 0;
1158     if (W != CM_Interleave)
1159       OtherMemberCost = InsertPosCost = Cost / Grp->getNumMembers();
1160     ;
1161     for (unsigned Idx = 0; Idx < Grp->getFactor(); ++Idx) {
1162       if (auto *I = Grp->getMember(Idx)) {
1163         if (Grp->getInsertPos() == I)
1164           WideningDecisions[std::make_pair(I, VF)] =
1165               std::make_pair(W, InsertPosCost);
1166         else
1167           WideningDecisions[std::make_pair(I, VF)] =
1168               std::make_pair(W, OtherMemberCost);
1169       }
1170     }
1171   }
1172 
1173   /// Return the cost model decision for the given instruction \p I and vector
1174   /// width \p VF. Return CM_Unknown if this instruction did not pass
1175   /// through the cost modeling.
1176   InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1177     assert(VF.isVector() && "Expected VF to be a vector VF");
1178     assert(
1179         TheLoop->isInnermost() &&
1180         "cost-model should not be used for outer loops (in VPlan-native path)");
1181 
1182     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1183     auto Itr = WideningDecisions.find(InstOnVF);
1184     if (Itr == WideningDecisions.end())
1185       return CM_Unknown;
1186     return Itr->second.first;
1187   }
1188 
1189   /// Return the vectorization cost for the given instruction \p I and vector
1190   /// width \p VF.
1191   InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1192     assert(VF.isVector() && "Expected VF >=2");
1193     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1194     assert(WideningDecisions.contains(InstOnVF) &&
1195            "The cost is not calculated");
1196     return WideningDecisions[InstOnVF].second;
1197   }
1198 
1199   struct CallWideningDecision {
1200     InstWidening Kind;
1201     Function *Variant;
1202     Intrinsic::ID IID;
1203     std::optional<unsigned> MaskPos;
1204     InstructionCost Cost;
1205   };
1206 
1207   void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind,
1208                                Function *Variant, Intrinsic::ID IID,
1209                                std::optional<unsigned> MaskPos,
1210                                InstructionCost Cost) {
1211     assert(!VF.isScalar() && "Expected vector VF");
1212     CallWideningDecisions[std::make_pair(CI, VF)] = {Kind, Variant, IID,
1213                                                      MaskPos, Cost};
1214   }
1215 
1216   CallWideningDecision getCallWideningDecision(CallInst *CI,
1217                                                ElementCount VF) const {
1218     assert(!VF.isScalar() && "Expected vector VF");
1219     return CallWideningDecisions.at(std::make_pair(CI, VF));
1220   }
1221 
1222   /// Return True if instruction \p I is an optimizable truncate whose operand
1223   /// is an induction variable. Such a truncate will be removed by adding a new
1224   /// induction variable with the destination type.
1225   bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1226     // If the instruction is not a truncate, return false.
1227     auto *Trunc = dyn_cast<TruncInst>(I);
1228     if (!Trunc)
1229       return false;
1230 
1231     // Get the source and destination types of the truncate.
1232     Type *SrcTy = toVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1233     Type *DestTy = toVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1234 
1235     // If the truncate is free for the given types, return false. Replacing a
1236     // free truncate with an induction variable would add an induction variable
1237     // update instruction to each iteration of the loop. We exclude from this
1238     // check the primary induction variable since it will need an update
1239     // instruction regardless.
1240     Value *Op = Trunc->getOperand(0);
1241     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1242       return false;
1243 
1244     // If the truncated value is not an induction variable, return false.
1245     return Legal->isInductionPhi(Op);
1246   }
1247 
1248   /// Collects the instructions to scalarize for each predicated instruction in
1249   /// the loop.
1250   void collectInstsToScalarize(ElementCount VF);
1251 
1252   /// Collect Uniform and Scalar values for the given \p VF.
1253   /// The sets depend on CM decision for Load/Store instructions
1254   /// that may be vectorized as interleave, gather-scatter or scalarized.
1255   /// Also make a decision on what to do about call instructions in the loop
1256   /// at that VF -- scalarize, call a known vector routine, or call a
1257   /// vector intrinsic.
1258   void collectUniformsAndScalars(ElementCount VF) {
1259     // Do the analysis once.
1260     if (VF.isScalar() || Uniforms.contains(VF))
1261       return;
1262     setCostBasedWideningDecision(VF);
1263     collectLoopUniforms(VF);
1264     setVectorizedCallDecision(VF);
1265     collectLoopScalars(VF);
1266   }
1267 
1268   /// Returns true if the target machine supports masked store operation
1269   /// for the given \p DataType and kind of access to \p Ptr.
1270   bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1271     return Legal->isConsecutivePtr(DataType, Ptr) &&
1272            TTI.isLegalMaskedStore(DataType, Alignment);
1273   }
1274 
1275   /// Returns true if the target machine supports masked load operation
1276   /// for the given \p DataType and kind of access to \p Ptr.
1277   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1278     return Legal->isConsecutivePtr(DataType, Ptr) &&
1279            TTI.isLegalMaskedLoad(DataType, Alignment);
1280   }
1281 
1282   /// Returns true if the target machine can represent \p V as a masked gather
1283   /// or scatter operation.
1284   bool isLegalGatherOrScatter(Value *V, ElementCount VF) {
1285     bool LI = isa<LoadInst>(V);
1286     bool SI = isa<StoreInst>(V);
1287     if (!LI && !SI)
1288       return false;
1289     auto *Ty = getLoadStoreType(V);
1290     Align Align = getLoadStoreAlignment(V);
1291     if (VF.isVector())
1292       Ty = VectorType::get(Ty, VF);
1293     return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1294            (SI && TTI.isLegalMaskedScatter(Ty, Align));
1295   }
1296 
1297   /// Returns true if the target machine supports all of the reduction
1298   /// variables found for the given VF.
1299   bool canVectorizeReductions(ElementCount VF) const {
1300     return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1301       const RecurrenceDescriptor &RdxDesc = Reduction.second;
1302       return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1303     }));
1304   }
1305 
1306   /// Given costs for both strategies, return true if the scalar predication
1307   /// lowering should be used for div/rem.  This incorporates an override
1308   /// option so it is not simply a cost comparison.
1309   bool isDivRemScalarWithPredication(InstructionCost ScalarCost,
1310                                      InstructionCost SafeDivisorCost) const {
1311     switch (ForceSafeDivisor) {
1312     case cl::BOU_UNSET:
1313       return ScalarCost < SafeDivisorCost;
1314     case cl::BOU_TRUE:
1315       return false;
1316     case cl::BOU_FALSE:
1317       return true;
1318     }
1319     llvm_unreachable("impossible case value");
1320   }
1321 
1322   /// Returns true if \p I is an instruction which requires predication and
1323   /// for which our chosen predication strategy is scalarization (i.e. we
1324   /// don't have an alternate strategy such as masking available).
1325   /// \p VF is the vectorization factor that will be used to vectorize \p I.
1326   bool isScalarWithPredication(Instruction *I, ElementCount VF) const;
1327 
1328   /// Returns true if \p I is an instruction that needs to be predicated
1329   /// at runtime.  The result is independent of the predication mechanism.
1330   /// Superset of instructions that return true for isScalarWithPredication.
1331   bool isPredicatedInst(Instruction *I) const;
1332 
1333   /// Return the costs for our two available strategies for lowering a
1334   /// div/rem operation which requires speculating at least one lane.
1335   /// First result is for scalarization (will be invalid for scalable
1336   /// vectors); second is for the safe-divisor strategy.
1337   std::pair<InstructionCost, InstructionCost>
1338   getDivRemSpeculationCost(Instruction *I,
1339                            ElementCount VF) const;
1340 
1341   /// Returns true if \p I is a memory instruction with consecutive memory
1342   /// access that can be widened.
1343   bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF);
1344 
1345   /// Returns true if \p I is a memory instruction in an interleaved-group
1346   /// of memory accesses that can be vectorized with wide vector loads/stores
1347   /// and shuffles.
1348   bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF) const;
1349 
1350   /// Check if \p Instr belongs to any interleaved access group.
1351   bool isAccessInterleaved(Instruction *Instr) const {
1352     return InterleaveInfo.isInterleaved(Instr);
1353   }
1354 
1355   /// Get the interleaved access group that \p Instr belongs to.
1356   const InterleaveGroup<Instruction> *
1357   getInterleavedAccessGroup(Instruction *Instr) const {
1358     return InterleaveInfo.getInterleaveGroup(Instr);
1359   }
1360 
1361   /// Returns true if we're required to use a scalar epilogue for at least
1362   /// the final iteration of the original loop.
1363   bool requiresScalarEpilogue(bool IsVectorizing) const {
1364     if (!isScalarEpilogueAllowed()) {
1365       LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
1366       return false;
1367     }
1368     // If we might exit from anywhere but the latch and early exit vectorization
1369     // is disabled, we must run the exiting iteration in scalar form.
1370     if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch() &&
1371         !(EnableEarlyExitVectorization && Legal->hasUncountableEarlyExit())) {
1372       LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: not exiting "
1373                            "from latch block\n");
1374       return true;
1375     }
1376     if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) {
1377       LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: "
1378                            "interleaved group requires scalar epilogue\n");
1379       return true;
1380     }
1381     LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
1382     return false;
1383   }
1384 
1385   /// Returns true if we're required to use a scalar epilogue for at least
1386   /// the final iteration of the original loop for all VFs in \p Range.
1387   /// A scalar epilogue must either be required for all VFs in \p Range or for
1388   /// none.
1389   bool requiresScalarEpilogue(VFRange Range) const {
1390     auto RequiresScalarEpilogue = [this](ElementCount VF) {
1391       return requiresScalarEpilogue(VF.isVector());
1392     };
1393     bool IsRequired = all_of(Range, RequiresScalarEpilogue);
1394     assert(
1395         (IsRequired || none_of(Range, RequiresScalarEpilogue)) &&
1396         "all VFs in range must agree on whether a scalar epilogue is required");
1397     return IsRequired;
1398   }
1399 
1400   /// Returns true if a scalar epilogue is not allowed due to optsize or a
1401   /// loop hint annotation.
1402   bool isScalarEpilogueAllowed() const {
1403     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1404   }
1405 
1406   /// Returns the TailFoldingStyle that is best for the current loop.
1407   TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
1408     if (!ChosenTailFoldingStyle)
1409       return TailFoldingStyle::None;
1410     return IVUpdateMayOverflow ? ChosenTailFoldingStyle->first
1411                                : ChosenTailFoldingStyle->second;
1412   }
1413 
1414   /// Selects and saves TailFoldingStyle for 2 options - if IV update may
1415   /// overflow or not.
1416   /// \param IsScalableVF true if scalable vector factors enabled.
1417   /// \param UserIC User specific interleave count.
1418   void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC) {
1419     assert(!ChosenTailFoldingStyle && "Tail folding must not be selected yet.");
1420     if (!Legal->canFoldTailByMasking()) {
1421       ChosenTailFoldingStyle =
1422           std::make_pair(TailFoldingStyle::None, TailFoldingStyle::None);
1423       return;
1424     }
1425 
1426     if (!ForceTailFoldingStyle.getNumOccurrences()) {
1427       ChosenTailFoldingStyle = std::make_pair(
1428           TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/true),
1429           TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false));
1430       return;
1431     }
1432 
1433     // Set styles when forced.
1434     ChosenTailFoldingStyle = std::make_pair(ForceTailFoldingStyle.getValue(),
1435                                             ForceTailFoldingStyle.getValue());
1436     if (ForceTailFoldingStyle != TailFoldingStyle::DataWithEVL)
1437       return;
1438     // Override forced styles if needed.
1439     // FIXME: use actual opcode/data type for analysis here.
1440     // FIXME: Investigate opportunity for fixed vector factor.
1441     // FIXME: support fixed-order recurrences by fixing splice of non VFxUF
1442     // penultimate EVL.
1443     bool EVLIsLegal =
1444         UserIC <= 1 && TTI.hasActiveVectorLength(0, nullptr, Align()) &&
1445         !EnableVPlanNativePath && Legal->getFixedOrderRecurrences().empty();
1446     if (!EVLIsLegal) {
1447       // If for some reason EVL mode is unsupported, fallback to
1448       // DataWithoutLaneMask to try to vectorize the loop with folded tail
1449       // in a generic way.
1450       ChosenTailFoldingStyle =
1451           std::make_pair(TailFoldingStyle::DataWithoutLaneMask,
1452                          TailFoldingStyle::DataWithoutLaneMask);
1453       LLVM_DEBUG(
1454           dbgs()
1455           << "LV: Preference for VP intrinsics indicated. Will "
1456              "not try to generate VP Intrinsics "
1457           << (UserIC > 1
1458                   ? "since interleave count specified is greater than 1.\n"
1459                   : "due to non-interleaving reasons.\n"));
1460     }
1461   }
1462 
1463   /// Returns true if all loop blocks should be masked to fold tail loop.
1464   bool foldTailByMasking() const {
1465     // TODO: check if it is possible to check for None style independent of
1466     // IVUpdateMayOverflow flag in getTailFoldingStyle.
1467     return getTailFoldingStyle() != TailFoldingStyle::None;
1468   }
1469 
1470   /// Return maximum safe number of elements to be processed per vector
1471   /// iteration, which do not prevent store-load forwarding and are safe with
1472   /// regard to the memory dependencies. Required for EVL-based VPlans to
1473   /// correctly calculate AVL (application vector length) as min(remaining AVL,
1474   /// MaxSafeElements).
1475   /// TODO: need to consider adjusting cost model to use this value as a
1476   /// vectorization factor for EVL-based vectorization.
1477   std::optional<unsigned> getMaxSafeElements() const { return MaxSafeElements; }
1478 
1479   /// Returns true if the instructions in this block requires predication
1480   /// for any reason, e.g. because tail folding now requires a predicate
1481   /// or because the block in the original loop was predicated.
1482   bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
1483     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1484   }
1485 
1486   /// Returns true if VP intrinsics with explicit vector length support should
1487   /// be generated in the tail folded loop.
1488   bool foldTailWithEVL() const {
1489     return getTailFoldingStyle() == TailFoldingStyle::DataWithEVL;
1490   }
1491 
1492   /// Returns true if the Phi is part of an inloop reduction.
1493   bool isInLoopReduction(PHINode *Phi) const {
1494     return InLoopReductions.contains(Phi);
1495   }
1496 
1497   /// Returns true if the predicated reduction select should be used to set the
1498   /// incoming value for the reduction phi.
1499   bool usePredicatedReductionSelect(unsigned Opcode, Type *PhiTy) const {
1500     // Force to use predicated reduction select since the EVL of the
1501     // second-to-last iteration might not be VF*UF.
1502     if (foldTailWithEVL())
1503       return true;
1504     return PreferPredicatedReductionSelect ||
1505            TTI.preferPredicatedReductionSelect(
1506                Opcode, PhiTy, TargetTransformInfo::ReductionFlags());
1507   }
1508 
1509   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1510   /// with factor VF.  Return the cost of the instruction, including
1511   /// scalarization overhead if it's needed.
1512   InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1513 
1514   /// Estimate cost of a call instruction CI if it were vectorized with factor
1515   /// VF. Return the cost of the instruction, including scalarization overhead
1516   /// if it's needed.
1517   InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const;
1518 
1519   /// Invalidates decisions already taken by the cost model.
1520   void invalidateCostModelingDecisions() {
1521     WideningDecisions.clear();
1522     CallWideningDecisions.clear();
1523     Uniforms.clear();
1524     Scalars.clear();
1525   }
1526 
1527   /// Returns the expected execution cost. The unit of the cost does
1528   /// not matter because we use the 'cost' units to compare different
1529   /// vector widths. The cost that is returned is *not* normalized by
1530   /// the factor width.
1531   InstructionCost expectedCost(ElementCount VF);
1532 
1533   bool hasPredStores() const { return NumPredStores > 0; }
1534 
1535   /// Returns true if epilogue vectorization is considered profitable, and
1536   /// false otherwise.
1537   /// \p VF is the vectorization factor chosen for the original loop.
1538   /// \p Multiplier is an aditional scaling factor applied to VF before
1539   /// comparing to EpilogueVectorizationMinVF.
1540   bool isEpilogueVectorizationProfitable(const ElementCount VF,
1541                                          const unsigned IC) const;
1542 
1543   /// Returns the execution time cost of an instruction for a given vector
1544   /// width. Vector width of one means scalar.
1545   InstructionCost getInstructionCost(Instruction *I, ElementCount VF);
1546 
1547   /// Return the cost of instructions in an inloop reduction pattern, if I is
1548   /// part of that pattern.
1549   std::optional<InstructionCost> getReductionPatternCost(Instruction *I,
1550                                                          ElementCount VF,
1551                                                          Type *VectorTy) const;
1552 
1553   /// Returns true if \p Op should be considered invariant and if it is
1554   /// trivially hoistable.
1555   bool shouldConsiderInvariant(Value *Op);
1556 
1557 private:
1558   unsigned NumPredStores = 0;
1559 
1560   /// \return An upper bound for the vectorization factors for both
1561   /// fixed and scalable vectorization, where the minimum-known number of
1562   /// elements is a power-of-2 larger than zero. If scalable vectorization is
1563   /// disabled or unsupported, then the scalable part will be equal to
1564   /// ElementCount::getScalable(0).
1565   FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount,
1566                                            ElementCount UserVF,
1567                                            bool FoldTailByMasking);
1568 
1569   /// \return the maximized element count based on the targets vector
1570   /// registers and the loop trip-count, but limited to a maximum safe VF.
1571   /// This is a helper function of computeFeasibleMaxVF.
1572   ElementCount getMaximizedVFForTarget(unsigned MaxTripCount,
1573                                        unsigned SmallestType,
1574                                        unsigned WidestType,
1575                                        ElementCount MaxSafeVF,
1576                                        bool FoldTailByMasking);
1577 
1578   /// Checks if scalable vectorization is supported and enabled. Caches the
1579   /// result to avoid repeated debug dumps for repeated queries.
1580   bool isScalableVectorizationAllowed();
1581 
1582   /// \return the maximum legal scalable VF, based on the safe max number
1583   /// of elements.
1584   ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1585 
1586   /// Calculate vectorization cost of memory instruction \p I.
1587   InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1588 
1589   /// The cost computation for scalarized memory instruction.
1590   InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1591 
1592   /// The cost computation for interleaving group of memory instructions.
1593   InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1594 
1595   /// The cost computation for Gather/Scatter instruction.
1596   InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1597 
1598   /// The cost computation for widening instruction \p I with consecutive
1599   /// memory access.
1600   InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1601 
1602   /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1603   /// Load: scalar load + broadcast.
1604   /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1605   /// element)
1606   InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1607 
1608   /// Estimate the overhead of scalarizing an instruction. This is a
1609   /// convenience wrapper for the type-based getScalarizationOverhead API.
1610   InstructionCost getScalarizationOverhead(Instruction *I,
1611                                            ElementCount VF) const;
1612 
1613   /// Returns true if an artificially high cost for emulated masked memrefs
1614   /// should be used.
1615   bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1616 
1617   /// Map of scalar integer values to the smallest bitwidth they can be legally
1618   /// represented as. The vector equivalents of these values should be truncated
1619   /// to this type.
1620   MapVector<Instruction *, uint64_t> MinBWs;
1621 
1622   /// A type representing the costs for instructions if they were to be
1623   /// scalarized rather than vectorized. The entries are Instruction-Cost
1624   /// pairs.
1625   using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1626 
1627   /// A set containing all BasicBlocks that are known to present after
1628   /// vectorization as a predicated block.
1629   DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>>
1630       PredicatedBBsAfterVectorization;
1631 
1632   /// Records whether it is allowed to have the original scalar loop execute at
1633   /// least once. This may be needed as a fallback loop in case runtime
1634   /// aliasing/dependence checks fail, or to handle the tail/remainder
1635   /// iterations when the trip count is unknown or doesn't divide by the VF,
1636   /// or as a peel-loop to handle gaps in interleave-groups.
1637   /// Under optsize and when the trip count is very small we don't allow any
1638   /// iterations to execute in the scalar loop.
1639   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1640 
1641   /// Control finally chosen tail folding style. The first element is used if
1642   /// the IV update may overflow, the second element - if it does not.
1643   std::optional<std::pair<TailFoldingStyle, TailFoldingStyle>>
1644       ChosenTailFoldingStyle;
1645 
1646   /// true if scalable vectorization is supported and enabled.
1647   std::optional<bool> IsScalableVectorizationAllowed;
1648 
1649   /// Maximum safe number of elements to be processed per vector iteration,
1650   /// which do not prevent store-load forwarding and are safe with regard to the
1651   /// memory dependencies. Required for EVL-based veectorization, where this
1652   /// value is used as the upper bound of the safe AVL.
1653   std::optional<unsigned> MaxSafeElements;
1654 
1655   /// A map holding scalar costs for different vectorization factors. The
1656   /// presence of a cost for an instruction in the mapping indicates that the
1657   /// instruction will be scalarized when vectorizing with the associated
1658   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1659   DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1660 
1661   /// Holds the instructions known to be uniform after vectorization.
1662   /// The data is collected per VF.
1663   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1664 
1665   /// Holds the instructions known to be scalar after vectorization.
1666   /// The data is collected per VF.
1667   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1668 
1669   /// Holds the instructions (address computations) that are forced to be
1670   /// scalarized.
1671   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1672 
1673   /// PHINodes of the reductions that should be expanded in-loop.
1674   SmallPtrSet<PHINode *, 4> InLoopReductions;
1675 
1676   /// A Map of inloop reduction operations and their immediate chain operand.
1677   /// FIXME: This can be removed once reductions can be costed correctly in
1678   /// VPlan. This was added to allow quick lookup of the inloop operations.
1679   DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1680 
1681   /// Returns the expected difference in cost from scalarizing the expression
1682   /// feeding a predicated instruction \p PredInst. The instructions to
1683   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1684   /// non-negative return value implies the expression will be scalarized.
1685   /// Currently, only single-use chains are considered for scalarization.
1686   InstructionCost computePredInstDiscount(Instruction *PredInst,
1687                                           ScalarCostsTy &ScalarCosts,
1688                                           ElementCount VF);
1689 
1690   /// Collect the instructions that are uniform after vectorization. An
1691   /// instruction is uniform if we represent it with a single scalar value in
1692   /// the vectorized loop corresponding to each vector iteration. Examples of
1693   /// uniform instructions include pointer operands of consecutive or
1694   /// interleaved memory accesses. Note that although uniformity implies an
1695   /// instruction will be scalar, the reverse is not true. In general, a
1696   /// scalarized instruction will be represented by VF scalar values in the
1697   /// vectorized loop, each corresponding to an iteration of the original
1698   /// scalar loop.
1699   void collectLoopUniforms(ElementCount VF);
1700 
1701   /// Collect the instructions that are scalar after vectorization. An
1702   /// instruction is scalar if it is known to be uniform or will be scalarized
1703   /// during vectorization. collectLoopScalars should only add non-uniform nodes
1704   /// to the list if they are used by a load/store instruction that is marked as
1705   /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
1706   /// VF values in the vectorized loop, each corresponding to an iteration of
1707   /// the original scalar loop.
1708   void collectLoopScalars(ElementCount VF);
1709 
1710   /// Keeps cost model vectorization decision and cost for instructions.
1711   /// Right now it is used for memory instructions only.
1712   using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1713                                 std::pair<InstWidening, InstructionCost>>;
1714 
1715   DecisionList WideningDecisions;
1716 
1717   using CallDecisionList =
1718       DenseMap<std::pair<CallInst *, ElementCount>, CallWideningDecision>;
1719 
1720   CallDecisionList CallWideningDecisions;
1721 
1722   /// Returns true if \p V is expected to be vectorized and it needs to be
1723   /// extracted.
1724   bool needsExtract(Value *V, ElementCount VF) const {
1725     Instruction *I = dyn_cast<Instruction>(V);
1726     if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1727         TheLoop->isLoopInvariant(I) ||
1728         getWideningDecision(I, VF) == CM_Scalarize)
1729       return false;
1730 
1731     // Assume we can vectorize V (and hence we need extraction) if the
1732     // scalars are not computed yet. This can happen, because it is called
1733     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1734     // the scalars are collected. That should be a safe assumption in most
1735     // cases, because we check if the operands have vectorizable types
1736     // beforehand in LoopVectorizationLegality.
1737     return !Scalars.contains(VF) || !isScalarAfterVectorization(I, VF);
1738   };
1739 
1740   /// Returns a range containing only operands needing to be extracted.
1741   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1742                                                    ElementCount VF) const {
1743     return SmallVector<Value *, 4>(make_filter_range(
1744         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1745   }
1746 
1747 public:
1748   /// The loop that we evaluate.
1749   Loop *TheLoop;
1750 
1751   /// Predicated scalar evolution analysis.
1752   PredicatedScalarEvolution &PSE;
1753 
1754   /// Loop Info analysis.
1755   LoopInfo *LI;
1756 
1757   /// Vectorization legality.
1758   LoopVectorizationLegality *Legal;
1759 
1760   /// Vector target information.
1761   const TargetTransformInfo &TTI;
1762 
1763   /// Target Library Info.
1764   const TargetLibraryInfo *TLI;
1765 
1766   /// Demanded bits analysis.
1767   DemandedBits *DB;
1768 
1769   /// Assumption cache.
1770   AssumptionCache *AC;
1771 
1772   /// Interface to emit optimization remarks.
1773   OptimizationRemarkEmitter *ORE;
1774 
1775   const Function *TheFunction;
1776 
1777   /// Loop Vectorize Hint.
1778   const LoopVectorizeHints *Hints;
1779 
1780   /// The interleave access information contains groups of interleaved accesses
1781   /// with the same stride and close to each other.
1782   InterleavedAccessInfo &InterleaveInfo;
1783 
1784   /// Values to ignore in the cost model.
1785   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1786 
1787   /// Values to ignore in the cost model when VF > 1.
1788   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1789 
1790   /// All element types found in the loop.
1791   SmallPtrSet<Type *, 16> ElementTypesInLoop;
1792 
1793   /// The kind of cost that we are calculating
1794   TTI::TargetCostKind CostKind;
1795 };
1796 } // end namespace llvm
1797 
1798 namespace {
1799 /// Helper struct to manage generating runtime checks for vectorization.
1800 ///
1801 /// The runtime checks are created up-front in temporary blocks to allow better
1802 /// estimating the cost and un-linked from the existing IR. After deciding to
1803 /// vectorize, the checks are moved back. If deciding not to vectorize, the
1804 /// temporary blocks are completely removed.
1805 class GeneratedRTChecks {
1806   /// Basic block which contains the generated SCEV checks, if any.
1807   BasicBlock *SCEVCheckBlock = nullptr;
1808 
1809   /// The value representing the result of the generated SCEV checks. If it is
1810   /// nullptr, either no SCEV checks have been generated or they have been used.
1811   Value *SCEVCheckCond = nullptr;
1812 
1813   /// Basic block which contains the generated memory runtime checks, if any.
1814   BasicBlock *MemCheckBlock = nullptr;
1815 
1816   /// The value representing the result of the generated memory runtime checks.
1817   /// If it is nullptr, either no memory runtime checks have been generated or
1818   /// they have been used.
1819   Value *MemRuntimeCheckCond = nullptr;
1820 
1821   DominatorTree *DT;
1822   LoopInfo *LI;
1823   TargetTransformInfo *TTI;
1824 
1825   SCEVExpander SCEVExp;
1826   SCEVExpander MemCheckExp;
1827 
1828   bool CostTooHigh = false;
1829   const bool AddBranchWeights;
1830 
1831   Loop *OuterLoop = nullptr;
1832 
1833   PredicatedScalarEvolution &PSE;
1834 
1835   /// The kind of cost that we are calculating
1836   TTI::TargetCostKind CostKind;
1837 
1838 public:
1839   GeneratedRTChecks(PredicatedScalarEvolution &PSE, DominatorTree *DT,
1840                     LoopInfo *LI, TargetTransformInfo *TTI,
1841                     const DataLayout &DL, bool AddBranchWeights,
1842                     TTI::TargetCostKind CostKind)
1843       : DT(DT), LI(LI), TTI(TTI), SCEVExp(*PSE.getSE(), DL, "scev.check"),
1844         MemCheckExp(*PSE.getSE(), DL, "scev.check"),
1845         AddBranchWeights(AddBranchWeights), PSE(PSE), CostKind(CostKind) {}
1846 
1847   /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1848   /// accurately estimate the cost of the runtime checks. The blocks are
1849   /// un-linked from the IR and are added back during vector code generation. If
1850   /// there is no vector code generation, the check blocks are removed
1851   /// completely.
1852   void create(Loop *L, const LoopAccessInfo &LAI,
1853               const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {
1854 
1855     // Hard cutoff to limit compile-time increase in case a very large number of
1856     // runtime checks needs to be generated.
1857     // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
1858     // profile info.
1859     CostTooHigh =
1860         LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold;
1861     if (CostTooHigh)
1862       return;
1863 
1864     BasicBlock *LoopHeader = L->getHeader();
1865     BasicBlock *Preheader = L->getLoopPreheader();
1866 
1867     // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1868     // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1869     // may be used by SCEVExpander. The blocks will be un-linked from their
1870     // predecessors and removed from LI & DT at the end of the function.
1871     if (!UnionPred.isAlwaysTrue()) {
1872       SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1873                                   nullptr, "vector.scevcheck");
1874 
1875       SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1876           &UnionPred, SCEVCheckBlock->getTerminator());
1877     }
1878 
1879     const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1880     if (RtPtrChecking.Need) {
1881       auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1882       MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
1883                                  "vector.memcheck");
1884 
1885       auto DiffChecks = RtPtrChecking.getDiffChecks();
1886       if (DiffChecks) {
1887         Value *RuntimeVF = nullptr;
1888         MemRuntimeCheckCond = addDiffRuntimeChecks(
1889             MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp,
1890             [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
1891               if (!RuntimeVF)
1892                 RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);
1893               return RuntimeVF;
1894             },
1895             IC);
1896       } else {
1897         MemRuntimeCheckCond = addRuntimeChecks(
1898             MemCheckBlock->getTerminator(), L, RtPtrChecking.getChecks(),
1899             MemCheckExp, VectorizerParams::HoistRuntimeChecks);
1900       }
1901       assert(MemRuntimeCheckCond &&
1902              "no RT checks generated although RtPtrChecking "
1903              "claimed checks are required");
1904     }
1905 
1906     if (!MemCheckBlock && !SCEVCheckBlock)
1907       return;
1908 
1909     // Unhook the temporary block with the checks, update various places
1910     // accordingly.
1911     if (SCEVCheckBlock)
1912       SCEVCheckBlock->replaceAllUsesWith(Preheader);
1913     if (MemCheckBlock)
1914       MemCheckBlock->replaceAllUsesWith(Preheader);
1915 
1916     if (SCEVCheckBlock) {
1917       SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1918       new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
1919       Preheader->getTerminator()->eraseFromParent();
1920     }
1921     if (MemCheckBlock) {
1922       MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1923       new UnreachableInst(Preheader->getContext(), MemCheckBlock);
1924       Preheader->getTerminator()->eraseFromParent();
1925     }
1926 
1927     DT->changeImmediateDominator(LoopHeader, Preheader);
1928     if (MemCheckBlock) {
1929       DT->eraseNode(MemCheckBlock);
1930       LI->removeBlock(MemCheckBlock);
1931     }
1932     if (SCEVCheckBlock) {
1933       DT->eraseNode(SCEVCheckBlock);
1934       LI->removeBlock(SCEVCheckBlock);
1935     }
1936 
1937     // Outer loop is used as part of the later cost calculations.
1938     OuterLoop = L->getParentLoop();
1939   }
1940 
1941   InstructionCost getCost() {
1942     if (SCEVCheckBlock || MemCheckBlock)
1943       LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");
1944 
1945     if (CostTooHigh) {
1946       InstructionCost Cost;
1947       Cost.setInvalid();
1948       LLVM_DEBUG(dbgs() << "  number of checks exceeded threshold\n");
1949       return Cost;
1950     }
1951 
1952     InstructionCost RTCheckCost = 0;
1953     if (SCEVCheckBlock)
1954       for (Instruction &I : *SCEVCheckBlock) {
1955         if (SCEVCheckBlock->getTerminator() == &I)
1956           continue;
1957         InstructionCost C = TTI->getInstructionCost(&I, CostKind);
1958         LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
1959         RTCheckCost += C;
1960       }
1961     if (MemCheckBlock) {
1962       InstructionCost MemCheckCost = 0;
1963       for (Instruction &I : *MemCheckBlock) {
1964         if (MemCheckBlock->getTerminator() == &I)
1965           continue;
1966         InstructionCost C = TTI->getInstructionCost(&I, CostKind);
1967         LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
1968         MemCheckCost += C;
1969       }
1970 
1971       // If the runtime memory checks are being created inside an outer loop
1972       // we should find out if these checks are outer loop invariant. If so,
1973       // the checks will likely be hoisted out and so the effective cost will
1974       // reduce according to the outer loop trip count.
1975       if (OuterLoop) {
1976         ScalarEvolution *SE = MemCheckExp.getSE();
1977         // TODO: If profitable, we could refine this further by analysing every
1978         // individual memory check, since there could be a mixture of loop
1979         // variant and invariant checks that mean the final condition is
1980         // variant.
1981         const SCEV *Cond = SE->getSCEV(MemRuntimeCheckCond);
1982         if (SE->isLoopInvariant(Cond, OuterLoop)) {
1983           // It seems reasonable to assume that we can reduce the effective
1984           // cost of the checks even when we know nothing about the trip
1985           // count. Assume that the outer loop executes at least twice.
1986           unsigned BestTripCount = 2;
1987 
1988           // Get the best known TC estimate.
1989           if (auto EstimatedTC = getSmallBestKnownTC(
1990                   PSE, OuterLoop, /* CanUseConstantMax = */ false))
1991             BestTripCount = *EstimatedTC;
1992 
1993           BestTripCount = std::max(BestTripCount, 1U);
1994           InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount;
1995 
1996           // Let's ensure the cost is always at least 1.
1997           NewMemCheckCost = std::max(*NewMemCheckCost.getValue(),
1998                                      (InstructionCost::CostType)1);
1999 
2000           if (BestTripCount > 1)
2001             LLVM_DEBUG(dbgs()
2002                        << "We expect runtime memory checks to be hoisted "
2003                        << "out of the outer loop. Cost reduced from "
2004                        << MemCheckCost << " to " << NewMemCheckCost << '\n');
2005 
2006           MemCheckCost = NewMemCheckCost;
2007         }
2008       }
2009 
2010       RTCheckCost += MemCheckCost;
2011     }
2012 
2013     if (SCEVCheckBlock || MemCheckBlock)
2014       LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
2015                         << "\n");
2016 
2017     return RTCheckCost;
2018   }
2019 
2020   /// Remove the created SCEV & memory runtime check blocks & instructions, if
2021   /// unused.
2022   ~GeneratedRTChecks() {
2023     SCEVExpanderCleaner SCEVCleaner(SCEVExp);
2024     SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
2025     if (!SCEVCheckCond)
2026       SCEVCleaner.markResultUsed();
2027 
2028     if (!MemRuntimeCheckCond)
2029       MemCheckCleaner.markResultUsed();
2030 
2031     if (MemRuntimeCheckCond) {
2032       auto &SE = *MemCheckExp.getSE();
2033       // Memory runtime check generation creates compares that use expanded
2034       // values. Remove them before running the SCEVExpanderCleaners.
2035       for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
2036         if (MemCheckExp.isInsertedInstruction(&I))
2037           continue;
2038         SE.forgetValue(&I);
2039         I.eraseFromParent();
2040       }
2041     }
2042     MemCheckCleaner.cleanup();
2043     SCEVCleaner.cleanup();
2044 
2045     if (SCEVCheckCond)
2046       SCEVCheckBlock->eraseFromParent();
2047     if (MemRuntimeCheckCond)
2048       MemCheckBlock->eraseFromParent();
2049   }
2050 
2051   /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
2052   /// adjusts the branches to branch to the vector preheader or \p Bypass,
2053   /// depending on the generated condition.
2054   BasicBlock *emitSCEVChecks(BasicBlock *Bypass,
2055                              BasicBlock *LoopVectorPreHeader) {
2056     if (!SCEVCheckCond)
2057       return nullptr;
2058 
2059     Value *Cond = SCEVCheckCond;
2060     // Mark the check as used, to prevent it from being removed during cleanup.
2061     SCEVCheckCond = nullptr;
2062     if (auto *C = dyn_cast<ConstantInt>(Cond))
2063       if (C->isZero())
2064         return nullptr;
2065 
2066     auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2067 
2068     BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
2069     // Create new preheader for vector loop.
2070     if (OuterLoop)
2071       OuterLoop->addBasicBlockToLoop(SCEVCheckBlock, *LI);
2072 
2073     SCEVCheckBlock->getTerminator()->eraseFromParent();
2074     SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2075     Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2076                                                 SCEVCheckBlock);
2077 
2078     DT->addNewBlock(SCEVCheckBlock, Pred);
2079     DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
2080 
2081     BranchInst &BI = *BranchInst::Create(Bypass, LoopVectorPreHeader, Cond);
2082     if (AddBranchWeights)
2083       setBranchWeights(BI, SCEVCheckBypassWeights, /*IsExpected=*/false);
2084     ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), &BI);
2085     return SCEVCheckBlock;
2086   }
2087 
2088   /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2089   /// the branches to branch to the vector preheader or \p Bypass, depending on
2090   /// the generated condition.
2091   BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass,
2092                                    BasicBlock *LoopVectorPreHeader) {
2093     // Check if we generated code that checks in runtime if arrays overlap.
2094     if (!MemRuntimeCheckCond)
2095       return nullptr;
2096 
2097     auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2098     Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2099                                                 MemCheckBlock);
2100 
2101     DT->addNewBlock(MemCheckBlock, Pred);
2102     DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
2103     MemCheckBlock->moveBefore(LoopVectorPreHeader);
2104 
2105     if (OuterLoop)
2106       OuterLoop->addBasicBlockToLoop(MemCheckBlock, *LI);
2107 
2108     BranchInst &BI =
2109         *BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond);
2110     if (AddBranchWeights) {
2111       setBranchWeights(BI, MemCheckBypassWeights, /*IsExpected=*/false);
2112     }
2113     ReplaceInstWithInst(MemCheckBlock->getTerminator(), &BI);
2114     MemCheckBlock->getTerminator()->setDebugLoc(
2115         Pred->getTerminator()->getDebugLoc());
2116 
2117     // Mark the check as used, to prevent it from being removed during cleanup.
2118     MemRuntimeCheckCond = nullptr;
2119     return MemCheckBlock;
2120   }
2121 };
2122 } // namespace
2123 
2124 static bool useActiveLaneMask(TailFoldingStyle Style) {
2125   return Style == TailFoldingStyle::Data ||
2126          Style == TailFoldingStyle::DataAndControlFlow ||
2127          Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2128 }
2129 
2130 static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style) {
2131   return Style == TailFoldingStyle::DataAndControlFlow ||
2132          Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2133 }
2134 
2135 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
2136 // vectorization. The loop needs to be annotated with #pragma omp simd
2137 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
2138 // vector length information is not provided, vectorization is not considered
2139 // explicit. Interleave hints are not allowed either. These limitations will be
2140 // relaxed in the future.
2141 // Please, note that we are currently forced to abuse the pragma 'clang
2142 // vectorize' semantics. This pragma provides *auto-vectorization hints*
2143 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2144 // provides *explicit vectorization hints* (LV can bypass legal checks and
2145 // assume that vectorization is legal). However, both hints are implemented
2146 // using the same metadata (llvm.loop.vectorize, processed by
2147 // LoopVectorizeHints). This will be fixed in the future when the native IR
2148 // representation for pragma 'omp simd' is introduced.
2149 static bool isExplicitVecOuterLoop(Loop *OuterLp,
2150                                    OptimizationRemarkEmitter *ORE) {
2151   assert(!OuterLp->isInnermost() && "This is not an outer loop");
2152   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2153 
2154   // Only outer loops with an explicit vectorization hint are supported.
2155   // Unannotated outer loops are ignored.
2156   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
2157     return false;
2158 
2159   Function *Fn = OuterLp->getHeader()->getParent();
2160   if (!Hints.allowVectorization(Fn, OuterLp,
2161                                 true /*VectorizeOnlyWhenForced*/)) {
2162     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2163     return false;
2164   }
2165 
2166   if (Hints.getInterleave() > 1) {
2167     // TODO: Interleave support is future work.
2168     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2169                          "outer loops.\n");
2170     Hints.emitRemarkWithHints();
2171     return false;
2172   }
2173 
2174   return true;
2175 }
2176 
2177 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
2178                                   OptimizationRemarkEmitter *ORE,
2179                                   SmallVectorImpl<Loop *> &V) {
2180   // Collect inner loops and outer loops without irreducible control flow. For
2181   // now, only collect outer loops that have explicit vectorization hints. If we
2182   // are stress testing the VPlan H-CFG construction, we collect the outermost
2183   // loop of every loop nest.
2184   if (L.isInnermost() || VPlanBuildStressTest ||
2185       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
2186     LoopBlocksRPO RPOT(&L);
2187     RPOT.perform(LI);
2188     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2189       V.push_back(&L);
2190       // TODO: Collect inner loops inside marked outer loops in case
2191       // vectorization fails for the outer loop. Do not invoke
2192       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2193       // already known to be reducible. We can use an inherited attribute for
2194       // that.
2195       return;
2196     }
2197   }
2198   for (Loop *InnerL : L)
2199     collectSupportedLoops(*InnerL, LI, ORE, V);
2200 }
2201 
2202 //===----------------------------------------------------------------------===//
2203 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2204 // LoopVectorizationCostModel and LoopVectorizationPlanner.
2205 //===----------------------------------------------------------------------===//
2206 
2207 /// Compute the transformed value of Index at offset StartValue using step
2208 /// StepValue.
2209 /// For integer induction, returns StartValue + Index * StepValue.
2210 /// For pointer induction, returns StartValue[Index * StepValue].
2211 /// FIXME: The newly created binary instructions should contain nsw/nuw
2212 /// flags, which can be found from the original scalar operations.
2213 static Value *
2214 emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue,
2215                      Value *Step,
2216                      InductionDescriptor::InductionKind InductionKind,
2217                      const BinaryOperator *InductionBinOp) {
2218   Type *StepTy = Step->getType();
2219   Value *CastedIndex = StepTy->isIntegerTy()
2220                            ? B.CreateSExtOrTrunc(Index, StepTy)
2221                            : B.CreateCast(Instruction::SIToFP, Index, StepTy);
2222   if (CastedIndex != Index) {
2223     CastedIndex->setName(CastedIndex->getName() + ".cast");
2224     Index = CastedIndex;
2225   }
2226 
2227   // Note: the IR at this point is broken. We cannot use SE to create any new
2228   // SCEV and then expand it, hoping that SCEV's simplification will give us
2229   // a more optimal code. Unfortunately, attempt of doing so on invalid IR may
2230   // lead to various SCEV crashes. So all we can do is to use builder and rely
2231   // on InstCombine for future simplifications. Here we handle some trivial
2232   // cases only.
2233   auto CreateAdd = [&B](Value *X, Value *Y) {
2234     assert(X->getType() == Y->getType() && "Types don't match!");
2235     if (auto *CX = dyn_cast<ConstantInt>(X))
2236       if (CX->isZero())
2237         return Y;
2238     if (auto *CY = dyn_cast<ConstantInt>(Y))
2239       if (CY->isZero())
2240         return X;
2241     return B.CreateAdd(X, Y);
2242   };
2243 
2244   // We allow X to be a vector type, in which case Y will potentially be
2245   // splatted into a vector with the same element count.
2246   auto CreateMul = [&B](Value *X, Value *Y) {
2247     assert(X->getType()->getScalarType() == Y->getType() &&
2248            "Types don't match!");
2249     if (auto *CX = dyn_cast<ConstantInt>(X))
2250       if (CX->isOne())
2251         return Y;
2252     if (auto *CY = dyn_cast<ConstantInt>(Y))
2253       if (CY->isOne())
2254         return X;
2255     VectorType *XVTy = dyn_cast<VectorType>(X->getType());
2256     if (XVTy && !isa<VectorType>(Y->getType()))
2257       Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
2258     return B.CreateMul(X, Y);
2259   };
2260 
2261   switch (InductionKind) {
2262   case InductionDescriptor::IK_IntInduction: {
2263     assert(!isa<VectorType>(Index->getType()) &&
2264            "Vector indices not supported for integer inductions yet");
2265     assert(Index->getType() == StartValue->getType() &&
2266            "Index type does not match StartValue type");
2267     if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne())
2268       return B.CreateSub(StartValue, Index);
2269     auto *Offset = CreateMul(Index, Step);
2270     return CreateAdd(StartValue, Offset);
2271   }
2272   case InductionDescriptor::IK_PtrInduction:
2273     return B.CreatePtrAdd(StartValue, CreateMul(Index, Step));
2274   case InductionDescriptor::IK_FpInduction: {
2275     assert(!isa<VectorType>(Index->getType()) &&
2276            "Vector indices not supported for FP inductions yet");
2277     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2278     assert(InductionBinOp &&
2279            (InductionBinOp->getOpcode() == Instruction::FAdd ||
2280             InductionBinOp->getOpcode() == Instruction::FSub) &&
2281            "Original bin op should be defined for FP induction");
2282 
2283     Value *MulExp = B.CreateFMul(Step, Index);
2284     return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2285                          "induction");
2286   }
2287   case InductionDescriptor::IK_NoInduction:
2288     return nullptr;
2289   }
2290   llvm_unreachable("invalid enum");
2291 }
2292 
2293 std::optional<unsigned> getMaxVScale(const Function &F,
2294                                      const TargetTransformInfo &TTI) {
2295   if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale())
2296     return MaxVScale;
2297 
2298   if (F.hasFnAttribute(Attribute::VScaleRange))
2299     return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
2300 
2301   return std::nullopt;
2302 }
2303 
2304 /// For the given VF and UF and maximum trip count computed for the loop, return
2305 /// whether the induction variable might overflow in the vectorized loop. If not,
2306 /// then we know a runtime overflow check always evaluates to false and can be
2307 /// removed.
2308 static bool isIndvarOverflowCheckKnownFalse(
2309     const LoopVectorizationCostModel *Cost,
2310     ElementCount VF, std::optional<unsigned> UF = std::nullopt) {
2311   // Always be conservative if we don't know the exact unroll factor.
2312   unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF);
2313 
2314   Type *IdxTy = Cost->Legal->getWidestInductionType();
2315   APInt MaxUIntTripCount = cast<IntegerType>(IdxTy)->getMask();
2316 
2317   // We know the runtime overflow check is known false iff the (max) trip-count
2318   // is known and (max) trip-count + (VF * UF) does not overflow in the type of
2319   // the vector loop induction variable.
2320   if (unsigned TC = Cost->PSE.getSmallConstantMaxTripCount()) {
2321     uint64_t MaxVF = VF.getKnownMinValue();
2322     if (VF.isScalable()) {
2323       std::optional<unsigned> MaxVScale =
2324           getMaxVScale(*Cost->TheFunction, Cost->TTI);
2325       if (!MaxVScale)
2326         return false;
2327       MaxVF *= *MaxVScale;
2328     }
2329 
2330     return (MaxUIntTripCount - TC).ugt(MaxVF * MaxUF);
2331   }
2332 
2333   return false;
2334 }
2335 
2336 // Return whether we allow using masked interleave-groups (for dealing with
2337 // strided loads/stores that reside in predicated blocks, or for dealing
2338 // with gaps).
2339 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2340   // If an override option has been passed in for interleaved accesses, use it.
2341   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2342     return EnableMaskedInterleavedMemAccesses;
2343 
2344   return TTI.enableMaskedInterleavedAccessVectorization();
2345 }
2346 
2347 void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr,
2348                                                VPReplicateRecipe *RepRecipe,
2349                                                const VPLane &Lane,
2350                                                VPTransformState &State) {
2351   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2352 
2353   // Does this instruction return a value ?
2354   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2355 
2356   Instruction *Cloned = Instr->clone();
2357   if (!IsVoidRetTy) {
2358     Cloned->setName(Instr->getName() + ".cloned");
2359 #if !defined(NDEBUG)
2360     // Verify that VPlan type inference results agree with the type of the
2361     // generated values.
2362     assert(State.TypeAnalysis.inferScalarType(RepRecipe) == Cloned->getType() &&
2363            "inferred type and type from generated instructions do not match");
2364 #endif
2365   }
2366 
2367   RepRecipe->setFlags(Cloned);
2368 
2369   if (auto DL = Instr->getDebugLoc())
2370     State.setDebugLocFrom(DL);
2371 
2372   // Replace the operands of the cloned instructions with their scalar
2373   // equivalents in the new loop.
2374   for (const auto &I : enumerate(RepRecipe->operands())) {
2375     auto InputLane = Lane;
2376     VPValue *Operand = I.value();
2377     if (vputils::isUniformAfterVectorization(Operand))
2378       InputLane = VPLane::getFirstLane();
2379     Cloned->setOperand(I.index(), State.get(Operand, InputLane));
2380   }
2381   State.addNewMetadata(Cloned, Instr);
2382 
2383   // Place the cloned scalar in the new loop.
2384   State.Builder.Insert(Cloned);
2385 
2386   State.set(RepRecipe, Cloned, Lane);
2387 
2388   // If we just cloned a new assumption, add it the assumption cache.
2389   if (auto *II = dyn_cast<AssumeInst>(Cloned))
2390     AC->registerAssumption(II);
2391 
2392   // End if-block.
2393   VPRegionBlock *Parent = RepRecipe->getParent()->getParent();
2394   bool IfPredicateInstr = Parent ? Parent->isReplicator() : false;
2395   assert(
2396       (Parent || !RepRecipe->getParent()->getPlan()->getVectorLoopRegion() ||
2397        all_of(RepRecipe->operands(),
2398               [](VPValue *Op) { return Op->isDefinedOutsideLoopRegions(); })) &&
2399       "Expected a recipe is either within a region or all of its operands "
2400       "are defined outside the vectorized region.");
2401   if (IfPredicateInstr)
2402     PredicatedInstructions.push_back(Cloned);
2403 }
2404 
2405 Value *
2406 InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
2407   if (VectorTripCount)
2408     return VectorTripCount;
2409 
2410   Value *TC = getTripCount();
2411   IRBuilder<> Builder(InsertBlock->getTerminator());
2412 
2413   Type *Ty = TC->getType();
2414   // This is where we can make the step a runtime constant.
2415   Value *Step = createStepForVF(Builder, Ty, VF, UF);
2416 
2417   // If the tail is to be folded by masking, round the number of iterations N
2418   // up to a multiple of Step instead of rounding down. This is done by first
2419   // adding Step-1 and then rounding down. Note that it's ok if this addition
2420   // overflows: the vector induction variable will eventually wrap to zero given
2421   // that it starts at zero and its Step is a power of two; the loop will then
2422   // exit, with the last early-exit vector comparison also producing all-true.
2423   // For scalable vectors the VF is not guaranteed to be a power of 2, but this
2424   // is accounted for in emitIterationCountCheck that adds an overflow check.
2425   if (Cost->foldTailByMasking()) {
2426     assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
2427            "VF*UF must be a power of 2 when folding tail by masking");
2428     TC = Builder.CreateAdd(TC, Builder.CreateSub(Step, ConstantInt::get(Ty, 1)),
2429                            "n.rnd.up");
2430   }
2431 
2432   // Now we need to generate the expression for the part of the loop that the
2433   // vectorized body will execute. This is equal to N - (N % Step) if scalar
2434   // iterations are not required for correctness, or N - Step, otherwise. Step
2435   // is equal to the vectorization factor (number of SIMD elements) times the
2436   // unroll factor (number of SIMD instructions).
2437   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2438 
2439   // There are cases where we *must* run at least one iteration in the remainder
2440   // loop.  See the cost model for when this can happen.  If the step evenly
2441   // divides the trip count, we set the remainder to be equal to the step. If
2442   // the step does not evenly divide the trip count, no adjustment is necessary
2443   // since there will already be scalar iterations. Note that the minimum
2444   // iterations check ensures that N >= Step.
2445   if (Cost->requiresScalarEpilogue(VF.isVector())) {
2446     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2447     R = Builder.CreateSelect(IsZero, Step, R);
2448   }
2449 
2450   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2451 
2452   return VectorTripCount;
2453 }
2454 
2455 void InnerLoopVectorizer::introduceCheckBlockInVPlan(BasicBlock *CheckIRBB) {
2456   VPBlockBase *ScalarPH = Plan.getScalarPreheader();
2457   VPBlockBase *PreVectorPH = VectorPHVPB->getSinglePredecessor();
2458   if (PreVectorPH->getNumSuccessors() != 1) {
2459     assert(PreVectorPH->getNumSuccessors() == 2 && "Expected 2 successors");
2460     assert(PreVectorPH->getSuccessors()[0] == ScalarPH &&
2461            "Unexpected successor");
2462     VPIRBasicBlock *CheckVPIRBB = Plan.createVPIRBasicBlock(CheckIRBB);
2463     VPBlockUtils::insertOnEdge(PreVectorPH, VectorPHVPB, CheckVPIRBB);
2464     PreVectorPH = CheckVPIRBB;
2465   }
2466   VPBlockUtils::connectBlocks(PreVectorPH, ScalarPH);
2467   PreVectorPH->swapSuccessors();
2468 }
2469 
2470 void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
2471   Value *Count = getTripCount();
2472   // Reuse existing vector loop preheader for TC checks.
2473   // Note that new preheader block is generated for vector loop.
2474   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2475   IRBuilder<> Builder(TCCheckBlock->getTerminator());
2476 
2477   // Generate code to check if the loop's trip count is less than VF * UF, or
2478   // equal to it in case a scalar epilogue is required; this implies that the
2479   // vector trip count is zero. This check also covers the case where adding one
2480   // to the backedge-taken count overflowed leading to an incorrect trip count
2481   // of zero. In this case we will also jump to the scalar loop.
2482   auto P = Cost->requiresScalarEpilogue(VF.isVector()) ? ICmpInst::ICMP_ULE
2483                                                        : ICmpInst::ICMP_ULT;
2484 
2485   // If tail is to be folded, vector loop takes care of all iterations.
2486   Type *CountTy = Count->getType();
2487   Value *CheckMinIters = Builder.getFalse();
2488   auto CreateStep = [&]() -> Value * {
2489     // Create step with max(MinProTripCount, UF * VF).
2490     if (UF * VF.getKnownMinValue() >= MinProfitableTripCount.getKnownMinValue())
2491       return createStepForVF(Builder, CountTy, VF, UF);
2492 
2493     Value *MinProfTC =
2494         createStepForVF(Builder, CountTy, MinProfitableTripCount, 1);
2495     if (!VF.isScalable())
2496       return MinProfTC;
2497     return Builder.CreateBinaryIntrinsic(
2498         Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF));
2499   };
2500 
2501   TailFoldingStyle Style = Cost->getTailFoldingStyle();
2502   if (Style == TailFoldingStyle::None) {
2503     Value *Step = CreateStep();
2504     ScalarEvolution &SE = *PSE.getSE();
2505     // TODO: Emit unconditional branch to vector preheader instead of
2506     // conditional branch with known condition.
2507     const SCEV *TripCountSCEV = SE.applyLoopGuards(SE.getSCEV(Count), OrigLoop);
2508     // Check if the trip count is < the step.
2509     if (SE.isKnownPredicate(P, TripCountSCEV, SE.getSCEV(Step))) {
2510       // TODO: Ensure step is at most the trip count when determining max VF and
2511       // UF, w/o tail folding.
2512       CheckMinIters = Builder.getTrue();
2513     } else if (!SE.isKnownPredicate(CmpInst::getInversePredicate(P),
2514                                     TripCountSCEV, SE.getSCEV(Step))) {
2515       // Generate the minimum iteration check only if we cannot prove the
2516       // check is known to be true, or known to be false.
2517       CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
2518     } // else step known to be < trip count, use CheckMinIters preset to false.
2519   } else if (VF.isScalable() &&
2520              !isIndvarOverflowCheckKnownFalse(Cost, VF, UF) &&
2521              Style != TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) {
2522     // vscale is not necessarily a power-of-2, which means we cannot guarantee
2523     // an overflow to zero when updating induction variables and so an
2524     // additional overflow check is required before entering the vector loop.
2525 
2526     // Get the maximum unsigned value for the type.
2527     Value *MaxUIntTripCount =
2528         ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask());
2529     Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count);
2530 
2531     // Don't execute the vector loop if (UMax - n) < (VF * UF).
2532     CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep());
2533   }
2534 
2535   // Create new preheader for vector loop.
2536   LoopVectorPreHeader =
2537       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
2538                  "vector.ph");
2539 
2540   assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
2541                                DT->getNode(Bypass)->getIDom()) &&
2542          "TC check is expected to dominate Bypass");
2543 
2544   BranchInst &BI =
2545       *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
2546   if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
2547     setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
2548   ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
2549   LoopBypassBlocks.push_back(TCCheckBlock);
2550 
2551   // TODO: Wrap LoopVectorPreHeader in VPIRBasicBlock here.
2552   introduceCheckBlockInVPlan(TCCheckBlock);
2553 }
2554 
2555 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {
2556   BasicBlock *const SCEVCheckBlock =
2557       RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader);
2558   if (!SCEVCheckBlock)
2559     return nullptr;
2560 
2561   assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
2562            (OptForSizeBasedOnProfile &&
2563             Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
2564          "Cannot SCEV check stride or overflow when optimizing for size");
2565   assert(!LoopBypassBlocks.empty() &&
2566          "Should already be a bypass block due to iteration count check");
2567   LoopBypassBlocks.push_back(SCEVCheckBlock);
2568   AddedSafetyChecks = true;
2569 
2570   introduceCheckBlockInVPlan(SCEVCheckBlock);
2571   return SCEVCheckBlock;
2572 }
2573 
2574 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) {
2575   // VPlan-native path does not do any analysis for runtime checks currently.
2576   if (EnableVPlanNativePath)
2577     return nullptr;
2578 
2579   BasicBlock *const MemCheckBlock =
2580       RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader);
2581 
2582   // Check if we generated code that checks in runtime if arrays overlap. We put
2583   // the checks into a separate block to make the more common case of few
2584   // elements faster.
2585   if (!MemCheckBlock)
2586     return nullptr;
2587 
2588   if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
2589     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
2590            "Cannot emit memory checks when optimizing for size, unless forced "
2591            "to vectorize.");
2592     ORE->emit([&]() {
2593       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
2594                                         OrigLoop->getStartLoc(),
2595                                         OrigLoop->getHeader())
2596              << "Code-size may be reduced by not forcing "
2597                 "vectorization, or by source-code modifications "
2598                 "eliminating the need for runtime checks "
2599                 "(e.g., adding 'restrict').";
2600     });
2601   }
2602 
2603   LoopBypassBlocks.push_back(MemCheckBlock);
2604 
2605   AddedSafetyChecks = true;
2606 
2607   introduceCheckBlockInVPlan(MemCheckBlock);
2608   return MemCheckBlock;
2609 }
2610 
2611 /// Replace \p VPBB with a VPIRBasicBlock wrapping \p IRBB. All recipes from \p
2612 /// VPBB are moved to the end of the newly created VPIRBasicBlock. VPBB must
2613 /// have a single predecessor, which is rewired to the new VPIRBasicBlock. All
2614 /// successors of VPBB, if any, are rewired to the new VPIRBasicBlock.
2615 static void replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB) {
2616   VPIRBasicBlock *IRVPBB = VPBB->getPlan()->createVPIRBasicBlock(IRBB);
2617   for (auto &R : make_early_inc_range(*VPBB)) {
2618     assert(!R.isPhi() && "Tried to move phi recipe to end of block");
2619     R.moveBefore(*IRVPBB, IRVPBB->end());
2620   }
2621 
2622   VPBlockUtils::reassociateBlocks(VPBB, IRVPBB);
2623   // VPBB is now dead and will be cleaned up when the plan gets destroyed.
2624 }
2625 
2626 void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
2627   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
2628   assert(LoopVectorPreHeader && "Invalid loop structure");
2629   assert((OrigLoop->getUniqueLatchExitBlock() ||
2630           Cost->requiresScalarEpilogue(VF.isVector())) &&
2631          "loops not exiting via the latch without required epilogue?");
2632 
2633   LoopMiddleBlock =
2634       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
2635                  LI, nullptr, Twine(Prefix) + "middle.block");
2636   replaceVPBBWithIRVPBB(Plan.getMiddleBlock(), LoopMiddleBlock);
2637   LoopScalarPreHeader =
2638       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
2639                  nullptr, Twine(Prefix) + "scalar.ph");
2640   replaceVPBBWithIRVPBB(Plan.getScalarPreheader(), LoopScalarPreHeader);
2641 }
2642 
2643 /// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
2644 /// expansion results.
2645 static Value *getExpandedStep(const InductionDescriptor &ID,
2646                               const SCEV2ValueTy &ExpandedSCEVs) {
2647   const SCEV *Step = ID.getStep();
2648   if (auto *C = dyn_cast<SCEVConstant>(Step))
2649     return C->getValue();
2650   if (auto *U = dyn_cast<SCEVUnknown>(Step))
2651     return U->getValue();
2652   auto I = ExpandedSCEVs.find(Step);
2653   assert(I != ExpandedSCEVs.end() && "SCEV must be expanded at this point");
2654   return I->second;
2655 }
2656 
2657 /// Knowing that loop \p L executes a single vector iteration, add instructions
2658 /// that will get simplified and thus should not have any cost to \p
2659 /// InstsToIgnore.
2660 static void addFullyUnrolledInstructionsToIgnore(
2661     Loop *L, const LoopVectorizationLegality::InductionList &IL,
2662     SmallPtrSetImpl<Instruction *> &InstsToIgnore) {
2663   auto *Cmp = L->getLatchCmpInst();
2664   if (Cmp)
2665     InstsToIgnore.insert(Cmp);
2666   for (const auto &KV : IL) {
2667     // Extract the key by hand so that it can be used in the lambda below.  Note
2668     // that captured structured bindings are a C++20 extension.
2669     const PHINode *IV = KV.first;
2670 
2671     // Get next iteration value of the induction variable.
2672     Instruction *IVInst =
2673         cast<Instruction>(IV->getIncomingValueForBlock(L->getLoopLatch()));
2674     if (all_of(IVInst->users(),
2675                [&](const User *U) { return U == IV || U == Cmp; }))
2676       InstsToIgnore.insert(IVInst);
2677   }
2678 }
2679 
2680 void InnerLoopVectorizer::createInductionAdditionalBypassValues(
2681     const SCEV2ValueTy &ExpandedSCEVs, Value *MainVectorTripCount) {
2682   assert(MainVectorTripCount && "Must have bypass information");
2683 
2684   Instruction *OldInduction = Legal->getPrimaryInduction();
2685   IRBuilder<> BypassBuilder(getAdditionalBypassBlock(),
2686                             getAdditionalBypassBlock()->getFirstInsertionPt());
2687   for (const auto &InductionEntry : Legal->getInductionVars()) {
2688     PHINode *OrigPhi = InductionEntry.first;
2689     const InductionDescriptor &II = InductionEntry.second;
2690     Value *Step = getExpandedStep(II, ExpandedSCEVs);
2691     // For the primary induction the additional bypass end value is known.
2692     // Otherwise it is computed.
2693     Value *EndValueFromAdditionalBypass = MainVectorTripCount;
2694     if (OrigPhi != OldInduction) {
2695       auto *BinOp = II.getInductionBinOp();
2696       // Fast-math-flags propagate from the original induction instruction.
2697       if (isa_and_nonnull<FPMathOperator>(BinOp))
2698         BypassBuilder.setFastMathFlags(BinOp->getFastMathFlags());
2699 
2700       // Compute the end value for the additional bypass.
2701       EndValueFromAdditionalBypass =
2702           emitTransformedIndex(BypassBuilder, MainVectorTripCount,
2703                                II.getStartValue(), Step, II.getKind(), BinOp);
2704       EndValueFromAdditionalBypass->setName("ind.end");
2705     }
2706 
2707     // Store the bypass value here, as it needs to be added as operand to its
2708     // scalar preheader phi node after the epilogue skeleton has been created.
2709     // TODO: Directly add as extra operand to the VPResumePHI recipe.
2710     assert(!Induction2AdditionalBypassValue.contains(OrigPhi) &&
2711            "entry for OrigPhi already exits");
2712     Induction2AdditionalBypassValue[OrigPhi] = EndValueFromAdditionalBypass;
2713   }
2714 }
2715 
2716 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton(
2717     const SCEV2ValueTy &ExpandedSCEVs) {
2718   /*
2719    In this function we generate a new loop. The new loop will contain
2720    the vectorized instructions while the old loop will continue to run the
2721    scalar remainder.
2722 
2723        [ ] <-- old preheader - loop iteration number check and SCEVs in Plan's
2724      /  |      preheader are expanded here. Eventually all required SCEV
2725     /   |      expansion should happen here.
2726    /    v
2727   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
2728   |  /  |
2729   | /   v
2730   ||   [ ]     <-- vector pre header.
2731   |/    |
2732   |     v
2733   |    [  ] \
2734   |    [  ]_|   <-- vector loop (created during VPlan execution).
2735   |     |
2736   |     v
2737   \   -[ ]   <--- middle-block (wrapped in VPIRBasicBlock with the branch to
2738    |    |                       successors created during VPlan execution)
2739    \/   |
2740    /\   v
2741    | ->[ ]     <--- new preheader (wrapped in VPIRBasicBlock).
2742    |    |
2743  (opt)  v      <-- edge from middle to exit iff epilogue is not required.
2744    |   [ ] \
2745    |   [ ]_|   <-- old scalar loop to handle remainder (scalar epilogue, header
2746    |    |          wrapped in VPIRBasicBlock).
2747     \   |
2748      \  v
2749       >[ ]     <-- exit block(s). (wrapped in VPIRBasicBlock)
2750    ...
2751    */
2752 
2753   // Create an empty vector loop, and prepare basic blocks for the runtime
2754   // checks.
2755   createVectorLoopSkeleton("");
2756 
2757   // Now, compare the new count to zero. If it is zero skip the vector loop and
2758   // jump to the scalar loop. This check also covers the case where the
2759   // backedge-taken count is uint##_max: adding one to it will overflow leading
2760   // to an incorrect trip count of zero. In this (rare) case we will also jump
2761   // to the scalar loop.
2762   emitIterationCountCheck(LoopScalarPreHeader);
2763 
2764   // Generate the code to check any assumptions that we've made for SCEV
2765   // expressions.
2766   emitSCEVChecks(LoopScalarPreHeader);
2767 
2768   // Generate the code that checks in runtime if arrays overlap. We put the
2769   // checks into a separate block to make the more common case of few elements
2770   // faster.
2771   emitMemRuntimeChecks(LoopScalarPreHeader);
2772 
2773   return LoopVectorPreHeader;
2774 }
2775 
2776 namespace {
2777 
2778 struct CSEDenseMapInfo {
2779   static bool canHandle(const Instruction *I) {
2780     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
2781            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
2782   }
2783 
2784   static inline Instruction *getEmptyKey() {
2785     return DenseMapInfo<Instruction *>::getEmptyKey();
2786   }
2787 
2788   static inline Instruction *getTombstoneKey() {
2789     return DenseMapInfo<Instruction *>::getTombstoneKey();
2790   }
2791 
2792   static unsigned getHashValue(const Instruction *I) {
2793     assert(canHandle(I) && "Unknown instruction!");
2794     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
2795                                                            I->value_op_end()));
2796   }
2797 
2798   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
2799     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
2800         LHS == getTombstoneKey() || RHS == getTombstoneKey())
2801       return LHS == RHS;
2802     return LHS->isIdenticalTo(RHS);
2803   }
2804 };
2805 
2806 } // end anonymous namespace
2807 
2808 ///Perform cse of induction variable instructions.
2809 static void cse(BasicBlock *BB) {
2810   // Perform simple cse.
2811   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
2812   for (Instruction &In : llvm::make_early_inc_range(*BB)) {
2813     if (!CSEDenseMapInfo::canHandle(&In))
2814       continue;
2815 
2816     // Check if we can replace this instruction with any of the
2817     // visited instructions.
2818     if (Instruction *V = CSEMap.lookup(&In)) {
2819       In.replaceAllUsesWith(V);
2820       In.eraseFromParent();
2821       continue;
2822     }
2823 
2824     CSEMap[&In] = &In;
2825   }
2826 }
2827 
2828 InstructionCost
2829 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
2830                                               ElementCount VF) const {
2831   // We only need to calculate a cost if the VF is scalar; for actual vectors
2832   // we should already have a pre-calculated cost at each VF.
2833   if (!VF.isScalar())
2834     return CallWideningDecisions.at(std::make_pair(CI, VF)).Cost;
2835 
2836   Type *RetTy = CI->getType();
2837   if (RecurrenceDescriptor::isFMulAddIntrinsic(CI))
2838     if (auto RedCost = getReductionPatternCost(CI, VF, RetTy))
2839       return *RedCost;
2840 
2841   SmallVector<Type *, 4> Tys;
2842   for (auto &ArgOp : CI->args())
2843     Tys.push_back(ArgOp->getType());
2844 
2845   InstructionCost ScalarCallCost =
2846       TTI.getCallInstrCost(CI->getCalledFunction(), RetTy, Tys, CostKind);
2847 
2848   // If this is an intrinsic we may have a lower cost for it.
2849   if (getVectorIntrinsicIDForCall(CI, TLI)) {
2850     InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
2851     return std::min(ScalarCallCost, IntrinsicCost);
2852   }
2853   return ScalarCallCost;
2854 }
2855 
2856 static Type *maybeVectorizeType(Type *Elt, ElementCount VF) {
2857   if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
2858     return Elt;
2859   return VectorType::get(Elt, VF);
2860 }
2861 
2862 InstructionCost
2863 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
2864                                                    ElementCount VF) const {
2865   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
2866   assert(ID && "Expected intrinsic call!");
2867   Type *RetTy = maybeVectorizeType(CI->getType(), VF);
2868   FastMathFlags FMF;
2869   if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
2870     FMF = FPMO->getFastMathFlags();
2871 
2872   SmallVector<const Value *> Arguments(CI->args());
2873   FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
2874   SmallVector<Type *> ParamTys;
2875   std::transform(FTy->param_begin(), FTy->param_end(),
2876                  std::back_inserter(ParamTys),
2877                  [&](Type *Ty) { return maybeVectorizeType(Ty, VF); });
2878 
2879   IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
2880                                     dyn_cast<IntrinsicInst>(CI));
2881   return TTI.getIntrinsicInstrCost(CostAttrs, CostKind);
2882 }
2883 
2884 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
2885   // Fix widened non-induction PHIs by setting up the PHI operands.
2886   if (EnableVPlanNativePath)
2887     fixNonInductionPHIs(State);
2888 
2889   // Forget the original basic block.
2890   PSE.getSE()->forgetLoop(OrigLoop);
2891   PSE.getSE()->forgetBlockAndLoopDispositions();
2892 
2893   // After vectorization, the exit blocks of the original loop will have
2894   // additional predecessors. Invalidate SCEVs for the exit phis in case SE
2895   // looked through single-entry phis.
2896   SmallVector<BasicBlock *> ExitBlocks;
2897   OrigLoop->getExitBlocks(ExitBlocks);
2898   for (BasicBlock *Exit : ExitBlocks)
2899     for (PHINode &PN : Exit->phis())
2900       PSE.getSE()->forgetLcssaPhiWithNewPredecessor(OrigLoop, &PN);
2901 
2902   // Don't apply optimizations below when no vector region remains, as they all
2903   // require a vector loop at the moment.
2904   if (!State.Plan->getVectorLoopRegion())
2905     return;
2906 
2907   for (Instruction *PI : PredicatedInstructions)
2908     sinkScalarOperands(&*PI);
2909 
2910   VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion();
2911   VPBasicBlock *HeaderVPBB = VectorRegion->getEntryBasicBlock();
2912   BasicBlock *HeaderBB = State.CFG.VPBB2IRBB[HeaderVPBB];
2913 
2914   // Remove redundant induction instructions.
2915   cse(HeaderBB);
2916 
2917   // Set/update profile weights for the vector and remainder loops as original
2918   // loop iterations are now distributed among them. Note that original loop
2919   // becomes the scalar remainder loop after vectorization.
2920   //
2921   // For cases like foldTailByMasking() and requiresScalarEpiloque() we may
2922   // end up getting slightly roughened result but that should be OK since
2923   // profile is not inherently precise anyway. Note also possible bypass of
2924   // vector code caused by legality checks is ignored, assigning all the weight
2925   // to the vector loop, optimistically.
2926   //
2927   // For scalable vectorization we can't know at compile time how many
2928   // iterations of the loop are handled in one vector iteration, so instead
2929   // assume a pessimistic vscale of '1'.
2930   Loop *VectorLoop = LI->getLoopFor(HeaderBB);
2931   setProfileInfoAfterUnrolling(OrigLoop, VectorLoop, OrigLoop,
2932                                VF.getKnownMinValue() * UF);
2933 }
2934 
2935 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
2936   // The basic block and loop containing the predicated instruction.
2937   auto *PredBB = PredInst->getParent();
2938   auto *VectorLoop = LI->getLoopFor(PredBB);
2939 
2940   // Initialize a worklist with the operands of the predicated instruction.
2941   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
2942 
2943   // Holds instructions that we need to analyze again. An instruction may be
2944   // reanalyzed if we don't yet know if we can sink it or not.
2945   SmallVector<Instruction *, 8> InstsToReanalyze;
2946 
2947   // Returns true if a given use occurs in the predicated block. Phi nodes use
2948   // their operands in their corresponding predecessor blocks.
2949   auto IsBlockOfUsePredicated = [&](Use &U) -> bool {
2950     auto *I = cast<Instruction>(U.getUser());
2951     BasicBlock *BB = I->getParent();
2952     if (auto *Phi = dyn_cast<PHINode>(I))
2953       BB = Phi->getIncomingBlock(
2954           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
2955     return BB == PredBB;
2956   };
2957 
2958   // Iteratively sink the scalarized operands of the predicated instruction
2959   // into the block we created for it. When an instruction is sunk, it's
2960   // operands are then added to the worklist. The algorithm ends after one pass
2961   // through the worklist doesn't sink a single instruction.
2962   bool Changed;
2963   do {
2964     // Add the instructions that need to be reanalyzed to the worklist, and
2965     // reset the changed indicator.
2966     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
2967     InstsToReanalyze.clear();
2968     Changed = false;
2969 
2970     while (!Worklist.empty()) {
2971       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
2972 
2973       // We can't sink an instruction if it is a phi node, is not in the loop,
2974       // may have side effects or may read from memory.
2975       // TODO: Could do more granular checking to allow sinking
2976       // a load past non-store instructions.
2977       if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
2978           I->mayHaveSideEffects() || I->mayReadFromMemory())
2979           continue;
2980 
2981       // If the instruction is already in PredBB, check if we can sink its
2982       // operands. In that case, VPlan's sinkScalarOperands() succeeded in
2983       // sinking the scalar instruction I, hence it appears in PredBB; but it
2984       // may have failed to sink I's operands (recursively), which we try
2985       // (again) here.
2986       if (I->getParent() == PredBB) {
2987         Worklist.insert(I->op_begin(), I->op_end());
2988         continue;
2989       }
2990 
2991       // It's legal to sink the instruction if all its uses occur in the
2992       // predicated block. Otherwise, there's nothing to do yet, and we may
2993       // need to reanalyze the instruction.
2994       if (!llvm::all_of(I->uses(), IsBlockOfUsePredicated)) {
2995         InstsToReanalyze.push_back(I);
2996         continue;
2997       }
2998 
2999       // Move the instruction to the beginning of the predicated block, and add
3000       // it's operands to the worklist.
3001       I->moveBefore(&*PredBB->getFirstInsertionPt());
3002       Worklist.insert(I->op_begin(), I->op_end());
3003 
3004       // The sinking may have enabled other instructions to be sunk, so we will
3005       // need to iterate.
3006       Changed = true;
3007     }
3008   } while (Changed);
3009 }
3010 
3011 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) {
3012   auto Iter = vp_depth_first_deep(Plan.getEntry());
3013   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
3014     for (VPRecipeBase &P : VPBB->phis()) {
3015       VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P);
3016       if (!VPPhi)
3017         continue;
3018       PHINode *NewPhi = cast<PHINode>(State.get(VPPhi));
3019       // Make sure the builder has a valid insert point.
3020       Builder.SetInsertPoint(NewPhi);
3021       for (unsigned Idx = 0; Idx < VPPhi->getNumOperands(); ++Idx) {
3022         VPValue *Inc = VPPhi->getIncomingValue(Idx);
3023         VPBasicBlock *VPBB = VPPhi->getIncomingBlock(Idx);
3024         NewPhi->addIncoming(State.get(Inc), State.CFG.VPBB2IRBB[VPBB]);
3025       }
3026     }
3027   }
3028 }
3029 
3030 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
3031   // We should not collect Scalars more than once per VF. Right now, this
3032   // function is called from collectUniformsAndScalars(), which already does
3033   // this check. Collecting Scalars for VF=1 does not make any sense.
3034   assert(VF.isVector() && !Scalars.contains(VF) &&
3035          "This function should not be visited twice for the same VF");
3036 
3037   // This avoids any chances of creating a REPLICATE recipe during planning
3038   // since that would result in generation of scalarized code during execution,
3039   // which is not supported for scalable vectors.
3040   if (VF.isScalable()) {
3041     Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end());
3042     return;
3043   }
3044 
3045   SmallSetVector<Instruction *, 8> Worklist;
3046 
3047   // These sets are used to seed the analysis with pointers used by memory
3048   // accesses that will remain scalar.
3049   SmallSetVector<Instruction *, 8> ScalarPtrs;
3050   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
3051   auto *Latch = TheLoop->getLoopLatch();
3052 
3053   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
3054   // The pointer operands of loads and stores will be scalar as long as the
3055   // memory access is not a gather or scatter operation. The value operand of a
3056   // store will remain scalar if the store is scalarized.
3057   auto IsScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
3058     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
3059     assert(WideningDecision != CM_Unknown &&
3060            "Widening decision should be ready at this moment");
3061     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
3062       if (Ptr == Store->getValueOperand())
3063         return WideningDecision == CM_Scalarize;
3064     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
3065            "Ptr is neither a value or pointer operand");
3066     return WideningDecision != CM_GatherScatter;
3067   };
3068 
3069   // A helper that returns true if the given value is a getelementptr
3070   // instruction contained in the loop.
3071   auto IsLoopVaryingGEP = [&](Value *V) {
3072     return isa<GetElementPtrInst>(V) && !TheLoop->isLoopInvariant(V);
3073   };
3074 
3075   // A helper that evaluates a memory access's use of a pointer. If the use will
3076   // be a scalar use and the pointer is only used by memory accesses, we place
3077   // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
3078   // PossibleNonScalarPtrs.
3079   auto EvaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
3080     // We only care about bitcast and getelementptr instructions contained in
3081     // the loop.
3082     if (!IsLoopVaryingGEP(Ptr))
3083       return;
3084 
3085     // If the pointer has already been identified as scalar (e.g., if it was
3086     // also identified as uniform), there's nothing to do.
3087     auto *I = cast<Instruction>(Ptr);
3088     if (Worklist.count(I))
3089       return;
3090 
3091     // If the use of the pointer will be a scalar use, and all users of the
3092     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
3093     // place the pointer in PossibleNonScalarPtrs.
3094     if (IsScalarUse(MemAccess, Ptr) &&
3095         all_of(I->users(), IsaPred<LoadInst, StoreInst>))
3096       ScalarPtrs.insert(I);
3097     else
3098       PossibleNonScalarPtrs.insert(I);
3099   };
3100 
3101   // We seed the scalars analysis with three classes of instructions: (1)
3102   // instructions marked uniform-after-vectorization and (2) bitcast,
3103   // getelementptr and (pointer) phi instructions used by memory accesses
3104   // requiring a scalar use.
3105   //
3106   // (1) Add to the worklist all instructions that have been identified as
3107   // uniform-after-vectorization.
3108   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
3109 
3110   // (2) Add to the worklist all bitcast and getelementptr instructions used by
3111   // memory accesses requiring a scalar use. The pointer operands of loads and
3112   // stores will be scalar unless the operation is a gather or scatter.
3113   // The value operand of a store will remain scalar if the store is scalarized.
3114   for (auto *BB : TheLoop->blocks())
3115     for (auto &I : *BB) {
3116       if (auto *Load = dyn_cast<LoadInst>(&I)) {
3117         EvaluatePtrUse(Load, Load->getPointerOperand());
3118       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
3119         EvaluatePtrUse(Store, Store->getPointerOperand());
3120         EvaluatePtrUse(Store, Store->getValueOperand());
3121       }
3122     }
3123   for (auto *I : ScalarPtrs)
3124     if (!PossibleNonScalarPtrs.count(I)) {
3125       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
3126       Worklist.insert(I);
3127     }
3128 
3129   // Insert the forced scalars.
3130   // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector
3131   // induction variable when the PHI user is scalarized.
3132   auto ForcedScalar = ForcedScalars.find(VF);
3133   if (ForcedScalar != ForcedScalars.end())
3134     for (auto *I : ForcedScalar->second) {
3135       LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n");
3136       Worklist.insert(I);
3137     }
3138 
3139   // Expand the worklist by looking through any bitcasts and getelementptr
3140   // instructions we've already identified as scalar. This is similar to the
3141   // expansion step in collectLoopUniforms(); however, here we're only
3142   // expanding to include additional bitcasts and getelementptr instructions.
3143   unsigned Idx = 0;
3144   while (Idx != Worklist.size()) {
3145     Instruction *Dst = Worklist[Idx++];
3146     if (!IsLoopVaryingGEP(Dst->getOperand(0)))
3147       continue;
3148     auto *Src = cast<Instruction>(Dst->getOperand(0));
3149     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
3150           auto *J = cast<Instruction>(U);
3151           return !TheLoop->contains(J) || Worklist.count(J) ||
3152                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
3153                   IsScalarUse(J, Src));
3154         })) {
3155       Worklist.insert(Src);
3156       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
3157     }
3158   }
3159 
3160   // An induction variable will remain scalar if all users of the induction
3161   // variable and induction variable update remain scalar.
3162   for (const auto &Induction : Legal->getInductionVars()) {
3163     auto *Ind = Induction.first;
3164     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
3165 
3166     // If tail-folding is applied, the primary induction variable will be used
3167     // to feed a vector compare.
3168     if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
3169       continue;
3170 
3171     // Returns true if \p Indvar is a pointer induction that is used directly by
3172     // load/store instruction \p I.
3173     auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
3174                                               Instruction *I) {
3175       return Induction.second.getKind() ==
3176                  InductionDescriptor::IK_PtrInduction &&
3177              (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
3178              Indvar == getLoadStorePointerOperand(I) && IsScalarUse(I, Indvar);
3179     };
3180 
3181     // Determine if all users of the induction variable are scalar after
3182     // vectorization.
3183     bool ScalarInd = all_of(Ind->users(), [&](User *U) -> bool {
3184       auto *I = cast<Instruction>(U);
3185       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
3186              IsDirectLoadStoreFromPtrIndvar(Ind, I);
3187     });
3188     if (!ScalarInd)
3189       continue;
3190 
3191     // If the induction variable update is a fixed-order recurrence, neither the
3192     // induction variable or its update should be marked scalar after
3193     // vectorization.
3194     auto *IndUpdatePhi = dyn_cast<PHINode>(IndUpdate);
3195     if (IndUpdatePhi && Legal->isFixedOrderRecurrence(IndUpdatePhi))
3196       continue;
3197 
3198     // Determine if all users of the induction variable update instruction are
3199     // scalar after vectorization.
3200     bool ScalarIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool {
3201       auto *I = cast<Instruction>(U);
3202       return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
3203              IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
3204     });
3205     if (!ScalarIndUpdate)
3206       continue;
3207 
3208     // The induction variable and its update instruction will remain scalar.
3209     Worklist.insert(Ind);
3210     Worklist.insert(IndUpdate);
3211     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
3212     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
3213                       << "\n");
3214   }
3215 
3216   Scalars[VF].insert(Worklist.begin(), Worklist.end());
3217 }
3218 
3219 bool LoopVectorizationCostModel::isScalarWithPredication(
3220     Instruction *I, ElementCount VF) const {
3221   if (!isPredicatedInst(I))
3222     return false;
3223 
3224   // Do we have a non-scalar lowering for this predicated
3225   // instruction? No - it is scalar with predication.
3226   switch(I->getOpcode()) {
3227   default:
3228     return true;
3229   case Instruction::Call:
3230     if (VF.isScalar())
3231       return true;
3232     return CallWideningDecisions.at(std::make_pair(cast<CallInst>(I), VF))
3233                .Kind == CM_Scalarize;
3234   case Instruction::Load:
3235   case Instruction::Store: {
3236     auto *Ptr = getLoadStorePointerOperand(I);
3237     auto *Ty = getLoadStoreType(I);
3238     Type *VTy = Ty;
3239     if (VF.isVector())
3240       VTy = VectorType::get(Ty, VF);
3241     const Align Alignment = getLoadStoreAlignment(I);
3242     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
3243                                 TTI.isLegalMaskedGather(VTy, Alignment))
3244                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
3245                                 TTI.isLegalMaskedScatter(VTy, Alignment));
3246   }
3247   case Instruction::UDiv:
3248   case Instruction::SDiv:
3249   case Instruction::SRem:
3250   case Instruction::URem: {
3251     // We have the option to use the safe-divisor idiom to avoid predication.
3252     // The cost based decision here will always select safe-divisor for
3253     // scalable vectors as scalarization isn't legal.
3254     const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
3255     return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost);
3256   }
3257   }
3258 }
3259 
3260 // TODO: Fold into LoopVectorizationLegality::isMaskRequired.
3261 bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
3262   // If predication is not needed, avoid it.
3263   // TODO: We can use the loop-preheader as context point here and get
3264   // context sensitive reasoning for isSafeToSpeculativelyExecute.
3265   if (!blockNeedsPredicationForAnyReason(I->getParent()) ||
3266       isSafeToSpeculativelyExecute(I) ||
3267       (isa<LoadInst, StoreInst, CallInst>(I) && !Legal->isMaskRequired(I)) ||
3268       isa<BranchInst, SwitchInst, PHINode, AllocaInst>(I))
3269     return false;
3270 
3271   // If the instruction was executed conditionally in the original scalar loop,
3272   // predication is needed with a mask whose lanes are all possibly inactive.
3273   if (Legal->blockNeedsPredication(I->getParent()))
3274     return true;
3275 
3276   // All that remain are instructions with side-effects originally executed in
3277   // the loop unconditionally, but now execute under a tail-fold mask (only)
3278   // having at least one active lane (the first). If the side-effects of the
3279   // instruction are invariant, executing it w/o (the tail-folding) mask is safe
3280   // - it will cause the same side-effects as when masked.
3281   switch(I->getOpcode()) {
3282   default:
3283     llvm_unreachable(
3284         "instruction should have been considered by earlier checks");
3285   case Instruction::Call:
3286     // Side-effects of a Call are assumed to be non-invariant, needing a
3287     // (fold-tail) mask.
3288     assert(Legal->isMaskRequired(I) &&
3289            "should have returned earlier for calls not needing a mask");
3290     return true;
3291   case Instruction::Load:
3292     // If the address is loop invariant no predication is needed.
3293     return !Legal->isInvariant(getLoadStorePointerOperand(I));
3294   case Instruction::Store: {
3295     // For stores, we need to prove both speculation safety (which follows from
3296     // the same argument as loads), but also must prove the value being stored
3297     // is correct.  The easiest form of the later is to require that all values
3298     // stored are the same.
3299     return !(Legal->isInvariant(getLoadStorePointerOperand(I)) &&
3300              TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()));
3301   }
3302   case Instruction::UDiv:
3303   case Instruction::SDiv:
3304   case Instruction::SRem:
3305   case Instruction::URem:
3306     // If the divisor is loop-invariant no predication is needed.
3307     return !TheLoop->isLoopInvariant(I->getOperand(1));
3308   }
3309 }
3310 
3311 std::pair<InstructionCost, InstructionCost>
3312 LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
3313                                                     ElementCount VF) const {
3314   assert(I->getOpcode() == Instruction::UDiv ||
3315          I->getOpcode() == Instruction::SDiv ||
3316          I->getOpcode() == Instruction::SRem ||
3317          I->getOpcode() == Instruction::URem);
3318   assert(!isSafeToSpeculativelyExecute(I));
3319 
3320   // Scalarization isn't legal for scalable vector types
3321   InstructionCost ScalarizationCost = InstructionCost::getInvalid();
3322   if (!VF.isScalable()) {
3323     // Get the scalarization cost and scale this amount by the probability of
3324     // executing the predicated block. If the instruction is not predicated,
3325     // we fall through to the next case.
3326     ScalarizationCost = 0;
3327 
3328     // These instructions have a non-void type, so account for the phi nodes
3329     // that we will create. This cost is likely to be zero. The phi node
3330     // cost, if any, should be scaled by the block probability because it
3331     // models a copy at the end of each predicated block.
3332     ScalarizationCost += VF.getKnownMinValue() *
3333       TTI.getCFInstrCost(Instruction::PHI, CostKind);
3334 
3335     // The cost of the non-predicated instruction.
3336     ScalarizationCost += VF.getKnownMinValue() *
3337       TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind);
3338 
3339     // The cost of insertelement and extractelement instructions needed for
3340     // scalarization.
3341     ScalarizationCost += getScalarizationOverhead(I, VF);
3342 
3343     // Scale the cost by the probability of executing the predicated blocks.
3344     // This assumes the predicated block for each vector lane is equally
3345     // likely.
3346     ScalarizationCost = ScalarizationCost / getReciprocalPredBlockProb();
3347   }
3348   InstructionCost SafeDivisorCost = 0;
3349 
3350   auto *VecTy = toVectorTy(I->getType(), VF);
3351 
3352   // The cost of the select guard to ensure all lanes are well defined
3353   // after we speculate above any internal control flow.
3354   SafeDivisorCost +=
3355       TTI.getCmpSelInstrCost(Instruction::Select, VecTy,
3356                              toVectorTy(Type::getInt1Ty(I->getContext()), VF),
3357                              CmpInst::BAD_ICMP_PREDICATE, CostKind);
3358 
3359   // Certain instructions can be cheaper to vectorize if they have a constant
3360   // second vector operand. One example of this are shifts on x86.
3361   Value *Op2 = I->getOperand(1);
3362   auto Op2Info = TTI.getOperandInfo(Op2);
3363   if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
3364       Legal->isInvariant(Op2))
3365     Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
3366 
3367   SmallVector<const Value *, 4> Operands(I->operand_values());
3368   SafeDivisorCost += TTI.getArithmeticInstrCost(
3369     I->getOpcode(), VecTy, CostKind,
3370     {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
3371     Op2Info, Operands, I);
3372   return {ScalarizationCost, SafeDivisorCost};
3373 }
3374 
3375 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
3376     Instruction *I, ElementCount VF) const {
3377   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
3378   assert(getWideningDecision(I, VF) == CM_Unknown &&
3379          "Decision should not be set yet.");
3380   auto *Group = getInterleavedAccessGroup(I);
3381   assert(Group && "Must have a group.");
3382   unsigned InterleaveFactor = Group->getFactor();
3383 
3384   // If the instruction's allocated size doesn't equal its type size, it
3385   // requires padding and will be scalarized.
3386   auto &DL = I->getDataLayout();
3387   auto *ScalarTy = getLoadStoreType(I);
3388   if (hasIrregularType(ScalarTy, DL))
3389     return false;
3390 
3391   // We currently only know how to emit interleave/deinterleave with
3392   // Factor=2 for scalable vectors. This is purely an implementation
3393   // limit.
3394   if (VF.isScalable() && InterleaveFactor != 2)
3395     return false;
3396 
3397   // If the group involves a non-integral pointer, we may not be able to
3398   // losslessly cast all values to a common type.
3399   bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy);
3400   for (unsigned Idx = 0; Idx < InterleaveFactor; Idx++) {
3401     Instruction *Member = Group->getMember(Idx);
3402     if (!Member)
3403       continue;
3404     auto *MemberTy = getLoadStoreType(Member);
3405     bool MemberNI = DL.isNonIntegralPointerType(MemberTy);
3406     // Don't coerce non-integral pointers to integers or vice versa.
3407     if (MemberNI != ScalarNI)
3408       // TODO: Consider adding special nullptr value case here
3409       return false;
3410     if (MemberNI && ScalarNI &&
3411         ScalarTy->getPointerAddressSpace() !=
3412             MemberTy->getPointerAddressSpace())
3413       return false;
3414   }
3415 
3416   // Check if masking is required.
3417   // A Group may need masking for one of two reasons: it resides in a block that
3418   // needs predication, or it was decided to use masking to deal with gaps
3419   // (either a gap at the end of a load-access that may result in a speculative
3420   // load, or any gaps in a store-access).
3421   bool PredicatedAccessRequiresMasking =
3422       blockNeedsPredicationForAnyReason(I->getParent()) &&
3423       Legal->isMaskRequired(I);
3424   bool LoadAccessWithGapsRequiresEpilogMasking =
3425       isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
3426       !isScalarEpilogueAllowed();
3427   bool StoreAccessWithGapsRequiresMasking =
3428       isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
3429   if (!PredicatedAccessRequiresMasking &&
3430       !LoadAccessWithGapsRequiresEpilogMasking &&
3431       !StoreAccessWithGapsRequiresMasking)
3432     return true;
3433 
3434   // If masked interleaving is required, we expect that the user/target had
3435   // enabled it, because otherwise it either wouldn't have been created or
3436   // it should have been invalidated by the CostModel.
3437   assert(useMaskedInterleavedAccesses(TTI) &&
3438          "Masked interleave-groups for predicated accesses are not enabled.");
3439 
3440   if (Group->isReverse())
3441     return false;
3442 
3443   auto *Ty = getLoadStoreType(I);
3444   const Align Alignment = getLoadStoreAlignment(I);
3445   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
3446                           : TTI.isLegalMaskedStore(Ty, Alignment);
3447 }
3448 
3449 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
3450     Instruction *I, ElementCount VF) {
3451   // Get and ensure we have a valid memory instruction.
3452   assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
3453 
3454   auto *Ptr = getLoadStorePointerOperand(I);
3455   auto *ScalarTy = getLoadStoreType(I);
3456 
3457   // In order to be widened, the pointer should be consecutive, first of all.
3458   if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
3459     return false;
3460 
3461   // If the instruction is a store located in a predicated block, it will be
3462   // scalarized.
3463   if (isScalarWithPredication(I, VF))
3464     return false;
3465 
3466   // If the instruction's allocated size doesn't equal it's type size, it
3467   // requires padding and will be scalarized.
3468   auto &DL = I->getDataLayout();
3469   if (hasIrregularType(ScalarTy, DL))
3470     return false;
3471 
3472   return true;
3473 }
3474 
3475 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
3476   // We should not collect Uniforms more than once per VF. Right now,
3477   // this function is called from collectUniformsAndScalars(), which
3478   // already does this check. Collecting Uniforms for VF=1 does not make any
3479   // sense.
3480 
3481   assert(VF.isVector() && !Uniforms.contains(VF) &&
3482          "This function should not be visited twice for the same VF");
3483 
3484   // Visit the list of Uniforms. If we find no uniform value, we won't
3485   // analyze again.  Uniforms.count(VF) will return 1.
3486   Uniforms[VF].clear();
3487 
3488   // Now we know that the loop is vectorizable!
3489   // Collect instructions inside the loop that will remain uniform after
3490   // vectorization.
3491 
3492   // Global values, params and instructions outside of current loop are out of
3493   // scope.
3494   auto IsOutOfScope = [&](Value *V) -> bool {
3495     Instruction *I = dyn_cast<Instruction>(V);
3496     return (!I || !TheLoop->contains(I));
3497   };
3498 
3499   // Worklist containing uniform instructions demanding lane 0.
3500   SetVector<Instruction *> Worklist;
3501 
3502   // Add uniform instructions demanding lane 0 to the worklist. Instructions
3503   // that require predication must not be considered uniform after
3504   // vectorization, because that would create an erroneous replicating region
3505   // where only a single instance out of VF should be formed.
3506   auto AddToWorklistIfAllowed = [&](Instruction *I) -> void {
3507     if (IsOutOfScope(I)) {
3508       LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
3509                         << *I << "\n");
3510       return;
3511     }
3512     if (isPredicatedInst(I)) {
3513       LLVM_DEBUG(
3514           dbgs() << "LV: Found not uniform due to requiring predication: " << *I
3515                  << "\n");
3516       return;
3517     }
3518     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
3519     Worklist.insert(I);
3520   };
3521 
3522   // Start with the conditional branches exiting the loop. If the branch
3523   // condition is an instruction contained in the loop that is only used by the
3524   // branch, it is uniform. Note conditions from uncountable early exits are not
3525   // uniform.
3526   SmallVector<BasicBlock *> Exiting;
3527   TheLoop->getExitingBlocks(Exiting);
3528   for (BasicBlock *E : Exiting) {
3529     if (Legal->hasUncountableEarlyExit() && TheLoop->getLoopLatch() != E)
3530       continue;
3531     auto *Cmp = dyn_cast<Instruction>(E->getTerminator()->getOperand(0));
3532     if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
3533       AddToWorklistIfAllowed(Cmp);
3534   }
3535 
3536   auto PrevVF = VF.divideCoefficientBy(2);
3537   // Return true if all lanes perform the same memory operation, and we can
3538   // thus choose to execute only one.
3539   auto IsUniformMemOpUse = [&](Instruction *I) {
3540     // If the value was already known to not be uniform for the previous
3541     // (smaller VF), it cannot be uniform for the larger VF.
3542     if (PrevVF.isVector()) {
3543       auto Iter = Uniforms.find(PrevVF);
3544       if (Iter != Uniforms.end() && !Iter->second.contains(I))
3545         return false;
3546     }
3547     if (!Legal->isUniformMemOp(*I, VF))
3548       return false;
3549     if (isa<LoadInst>(I))
3550       // Loading the same address always produces the same result - at least
3551       // assuming aliasing and ordering which have already been checked.
3552       return true;
3553     // Storing the same value on every iteration.
3554     return TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand());
3555   };
3556 
3557   auto IsUniformDecision = [&](Instruction *I, ElementCount VF) {
3558     InstWidening WideningDecision = getWideningDecision(I, VF);
3559     assert(WideningDecision != CM_Unknown &&
3560            "Widening decision should be ready at this moment");
3561 
3562     if (IsUniformMemOpUse(I))
3563       return true;
3564 
3565     return (WideningDecision == CM_Widen ||
3566             WideningDecision == CM_Widen_Reverse ||
3567             WideningDecision == CM_Interleave);
3568   };
3569 
3570   // Returns true if Ptr is the pointer operand of a memory access instruction
3571   // I, I is known to not require scalarization, and the pointer is not also
3572   // stored.
3573   auto IsVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
3574     if (isa<StoreInst>(I) && I->getOperand(0) == Ptr)
3575       return false;
3576     return getLoadStorePointerOperand(I) == Ptr &&
3577            (IsUniformDecision(I, VF) || Legal->isInvariant(Ptr));
3578   };
3579 
3580   // Holds a list of values which are known to have at least one uniform use.
3581   // Note that there may be other uses which aren't uniform.  A "uniform use"
3582   // here is something which only demands lane 0 of the unrolled iterations;
3583   // it does not imply that all lanes produce the same value (e.g. this is not
3584   // the usual meaning of uniform)
3585   SetVector<Value *> HasUniformUse;
3586 
3587   // Scan the loop for instructions which are either a) known to have only
3588   // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
3589   for (auto *BB : TheLoop->blocks())
3590     for (auto &I : *BB) {
3591       if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
3592         switch (II->getIntrinsicID()) {
3593         case Intrinsic::sideeffect:
3594         case Intrinsic::experimental_noalias_scope_decl:
3595         case Intrinsic::assume:
3596         case Intrinsic::lifetime_start:
3597         case Intrinsic::lifetime_end:
3598           if (TheLoop->hasLoopInvariantOperands(&I))
3599             AddToWorklistIfAllowed(&I);
3600           break;
3601         default:
3602           break;
3603         }
3604       }
3605 
3606       // ExtractValue instructions must be uniform, because the operands are
3607       // known to be loop-invariant.
3608       if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
3609         assert(IsOutOfScope(EVI->getAggregateOperand()) &&
3610                "Expected aggregate value to be loop invariant");
3611         AddToWorklistIfAllowed(EVI);
3612         continue;
3613       }
3614 
3615       // If there's no pointer operand, there's nothing to do.
3616       auto *Ptr = getLoadStorePointerOperand(&I);
3617       if (!Ptr)
3618         continue;
3619 
3620       if (IsUniformMemOpUse(&I))
3621         AddToWorklistIfAllowed(&I);
3622 
3623       if (IsVectorizedMemAccessUse(&I, Ptr))
3624         HasUniformUse.insert(Ptr);
3625     }
3626 
3627   // Add to the worklist any operands which have *only* uniform (e.g. lane 0
3628   // demanding) users.  Since loops are assumed to be in LCSSA form, this
3629   // disallows uses outside the loop as well.
3630   for (auto *V : HasUniformUse) {
3631     if (IsOutOfScope(V))
3632       continue;
3633     auto *I = cast<Instruction>(V);
3634     bool UsersAreMemAccesses = all_of(I->users(), [&](User *U) -> bool {
3635       auto *UI = cast<Instruction>(U);
3636       return TheLoop->contains(UI) && IsVectorizedMemAccessUse(UI, V);
3637     });
3638     if (UsersAreMemAccesses)
3639       AddToWorklistIfAllowed(I);
3640   }
3641 
3642   // Expand Worklist in topological order: whenever a new instruction
3643   // is added , its users should be already inside Worklist.  It ensures
3644   // a uniform instruction will only be used by uniform instructions.
3645   unsigned Idx = 0;
3646   while (Idx != Worklist.size()) {
3647     Instruction *I = Worklist[Idx++];
3648 
3649     for (auto *OV : I->operand_values()) {
3650       // isOutOfScope operands cannot be uniform instructions.
3651       if (IsOutOfScope(OV))
3652         continue;
3653       // First order recurrence Phi's should typically be considered
3654       // non-uniform.
3655       auto *OP = dyn_cast<PHINode>(OV);
3656       if (OP && Legal->isFixedOrderRecurrence(OP))
3657         continue;
3658       // If all the users of the operand are uniform, then add the
3659       // operand into the uniform worklist.
3660       auto *OI = cast<Instruction>(OV);
3661       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
3662             auto *J = cast<Instruction>(U);
3663             return Worklist.count(J) || IsVectorizedMemAccessUse(J, OI);
3664           }))
3665         AddToWorklistIfAllowed(OI);
3666     }
3667   }
3668 
3669   // For an instruction to be added into Worklist above, all its users inside
3670   // the loop should also be in Worklist. However, this condition cannot be
3671   // true for phi nodes that form a cyclic dependence. We must process phi
3672   // nodes separately. An induction variable will remain uniform if all users
3673   // of the induction variable and induction variable update remain uniform.
3674   // The code below handles both pointer and non-pointer induction variables.
3675   BasicBlock *Latch = TheLoop->getLoopLatch();
3676   for (const auto &Induction : Legal->getInductionVars()) {
3677     auto *Ind = Induction.first;
3678     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
3679 
3680     // Determine if all users of the induction variable are uniform after
3681     // vectorization.
3682     bool UniformInd = all_of(Ind->users(), [&](User *U) -> bool {
3683       auto *I = cast<Instruction>(U);
3684       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
3685              IsVectorizedMemAccessUse(I, Ind);
3686     });
3687     if (!UniformInd)
3688       continue;
3689 
3690     // Determine if all users of the induction variable update instruction are
3691     // uniform after vectorization.
3692     bool UniformIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool {
3693       auto *I = cast<Instruction>(U);
3694       return I == Ind || Worklist.count(I) ||
3695              IsVectorizedMemAccessUse(I, IndUpdate);
3696     });
3697     if (!UniformIndUpdate)
3698       continue;
3699 
3700     // The induction variable and its update instruction will remain uniform.
3701     AddToWorklistIfAllowed(Ind);
3702     AddToWorklistIfAllowed(IndUpdate);
3703   }
3704 
3705   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
3706 }
3707 
3708 bool LoopVectorizationCostModel::runtimeChecksRequired() {
3709   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
3710 
3711   if (Legal->getRuntimePointerChecking()->Need) {
3712     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
3713         "runtime pointer checks needed. Enable vectorization of this "
3714         "loop with '#pragma clang loop vectorize(enable)' when "
3715         "compiling with -Os/-Oz",
3716         "CantVersionLoopWithOptForSize", ORE, TheLoop);
3717     return true;
3718   }
3719 
3720   if (!PSE.getPredicate().isAlwaysTrue()) {
3721     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
3722         "runtime SCEV checks needed. Enable vectorization of this "
3723         "loop with '#pragma clang loop vectorize(enable)' when "
3724         "compiling with -Os/-Oz",
3725         "CantVersionLoopWithOptForSize", ORE, TheLoop);
3726     return true;
3727   }
3728 
3729   // FIXME: Avoid specializing for stride==1 instead of bailing out.
3730   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
3731     reportVectorizationFailure("Runtime stride check for small trip count",
3732         "runtime stride == 1 checks needed. Enable vectorization of "
3733         "this loop without such check by compiling with -Os/-Oz",
3734         "CantVersionLoopWithOptForSize", ORE, TheLoop);
3735     return true;
3736   }
3737 
3738   return false;
3739 }
3740 
3741 bool LoopVectorizationCostModel::isScalableVectorizationAllowed() {
3742   if (IsScalableVectorizationAllowed)
3743     return *IsScalableVectorizationAllowed;
3744 
3745   IsScalableVectorizationAllowed = false;
3746   if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
3747     return false;
3748 
3749   if (Hints->isScalableVectorizationDisabled()) {
3750     reportVectorizationInfo("Scalable vectorization is explicitly disabled",
3751                             "ScalableVectorizationDisabled", ORE, TheLoop);
3752     return false;
3753   }
3754 
3755   LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
3756 
3757   auto MaxScalableVF = ElementCount::getScalable(
3758       std::numeric_limits<ElementCount::ScalarTy>::max());
3759 
3760   // Test that the loop-vectorizer can legalize all operations for this MaxVF.
3761   // FIXME: While for scalable vectors this is currently sufficient, this should
3762   // be replaced by a more detailed mechanism that filters out specific VFs,
3763   // instead of invalidating vectorization for a whole set of VFs based on the
3764   // MaxVF.
3765 
3766   // Disable scalable vectorization if the loop contains unsupported reductions.
3767   if (!canVectorizeReductions(MaxScalableVF)) {
3768     reportVectorizationInfo(
3769         "Scalable vectorization not supported for the reduction "
3770         "operations found in this loop.",
3771         "ScalableVFUnfeasible", ORE, TheLoop);
3772     return false;
3773   }
3774 
3775   // Disable scalable vectorization if the loop contains any instructions
3776   // with element types not supported for scalable vectors.
3777   if (any_of(ElementTypesInLoop, [&](Type *Ty) {
3778         return !Ty->isVoidTy() &&
3779                !this->TTI.isElementTypeLegalForScalableVector(Ty);
3780       })) {
3781     reportVectorizationInfo("Scalable vectorization is not supported "
3782                             "for all element types found in this loop.",
3783                             "ScalableVFUnfeasible", ORE, TheLoop);
3784     return false;
3785   }
3786 
3787   if (!Legal->isSafeForAnyVectorWidth() && !getMaxVScale(*TheFunction, TTI)) {
3788     reportVectorizationInfo("The target does not provide maximum vscale value "
3789                             "for safe distance analysis.",
3790                             "ScalableVFUnfeasible", ORE, TheLoop);
3791     return false;
3792   }
3793 
3794   IsScalableVectorizationAllowed = true;
3795   return true;
3796 }
3797 
3798 ElementCount
3799 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
3800   if (!isScalableVectorizationAllowed())
3801     return ElementCount::getScalable(0);
3802 
3803   auto MaxScalableVF = ElementCount::getScalable(
3804       std::numeric_limits<ElementCount::ScalarTy>::max());
3805   if (Legal->isSafeForAnyVectorWidth())
3806     return MaxScalableVF;
3807 
3808   std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
3809   // Limit MaxScalableVF by the maximum safe dependence distance.
3810   MaxScalableVF = ElementCount::getScalable(MaxSafeElements / *MaxVScale);
3811 
3812   if (!MaxScalableVF)
3813     reportVectorizationInfo(
3814         "Max legal vector width too small, scalable vectorization "
3815         "unfeasible.",
3816         "ScalableVFUnfeasible", ORE, TheLoop);
3817 
3818   return MaxScalableVF;
3819 }
3820 
3821 FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
3822     unsigned MaxTripCount, ElementCount UserVF, bool FoldTailByMasking) {
3823   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
3824   unsigned SmallestType, WidestType;
3825   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
3826 
3827   // Get the maximum safe dependence distance in bits computed by LAA.
3828   // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
3829   // the memory accesses that is most restrictive (involved in the smallest
3830   // dependence distance).
3831   unsigned MaxSafeElements =
3832       llvm::bit_floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
3833 
3834   auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
3835   auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
3836   if (!Legal->isSafeForAnyVectorWidth())
3837     this->MaxSafeElements = MaxSafeElements;
3838 
3839   LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
3840                     << ".\n");
3841   LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
3842                     << ".\n");
3843 
3844   // First analyze the UserVF, fall back if the UserVF should be ignored.
3845   if (UserVF) {
3846     auto MaxSafeUserVF =
3847         UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
3848 
3849     if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
3850       // If `VF=vscale x N` is safe, then so is `VF=N`
3851       if (UserVF.isScalable())
3852         return FixedScalableVFPair(
3853             ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
3854 
3855       return UserVF;
3856     }
3857 
3858     assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
3859 
3860     // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
3861     // is better to ignore the hint and let the compiler choose a suitable VF.
3862     if (!UserVF.isScalable()) {
3863       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3864                         << " is unsafe, clamping to max safe VF="
3865                         << MaxSafeFixedVF << ".\n");
3866       ORE->emit([&]() {
3867         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
3868                                           TheLoop->getStartLoc(),
3869                                           TheLoop->getHeader())
3870                << "User-specified vectorization factor "
3871                << ore::NV("UserVectorizationFactor", UserVF)
3872                << " is unsafe, clamping to maximum safe vectorization factor "
3873                << ore::NV("VectorizationFactor", MaxSafeFixedVF);
3874       });
3875       return MaxSafeFixedVF;
3876     }
3877 
3878     if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
3879       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3880                         << " is ignored because scalable vectors are not "
3881                            "available.\n");
3882       ORE->emit([&]() {
3883         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
3884                                           TheLoop->getStartLoc(),
3885                                           TheLoop->getHeader())
3886                << "User-specified vectorization factor "
3887                << ore::NV("UserVectorizationFactor", UserVF)
3888                << " is ignored because the target does not support scalable "
3889                   "vectors. The compiler will pick a more suitable value.";
3890       });
3891     } else {
3892       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3893                         << " is unsafe. Ignoring scalable UserVF.\n");
3894       ORE->emit([&]() {
3895         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
3896                                           TheLoop->getStartLoc(),
3897                                           TheLoop->getHeader())
3898                << "User-specified vectorization factor "
3899                << ore::NV("UserVectorizationFactor", UserVF)
3900                << " is unsafe. Ignoring the hint to let the compiler pick a "
3901                   "more suitable value.";
3902       });
3903     }
3904   }
3905 
3906   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
3907                     << " / " << WidestType << " bits.\n");
3908 
3909   FixedScalableVFPair Result(ElementCount::getFixed(1),
3910                              ElementCount::getScalable(0));
3911   if (auto MaxVF =
3912           getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
3913                                   MaxSafeFixedVF, FoldTailByMasking))
3914     Result.FixedVF = MaxVF;
3915 
3916   if (auto MaxVF =
3917           getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
3918                                   MaxSafeScalableVF, FoldTailByMasking))
3919     if (MaxVF.isScalable()) {
3920       Result.ScalableVF = MaxVF;
3921       LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
3922                         << "\n");
3923     }
3924 
3925   return Result;
3926 }
3927 
3928 FixedScalableVFPair
3929 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
3930   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
3931     // TODO: It may be useful to do since it's still likely to be dynamically
3932     // uniform if the target can skip.
3933     reportVectorizationFailure(
3934         "Not inserting runtime ptr check for divergent target",
3935         "runtime pointer checks needed. Not enabled for divergent target",
3936         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
3937     return FixedScalableVFPair::getNone();
3938   }
3939 
3940   ScalarEvolution *SE = PSE.getSE();
3941   unsigned TC = SE->getSmallConstantTripCount(TheLoop);
3942   unsigned MaxTC = PSE.getSmallConstantMaxTripCount();
3943   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
3944   if (TC != MaxTC)
3945     LLVM_DEBUG(dbgs() << "LV: Found maximum trip count: " << MaxTC << '\n');
3946   if (TC == 1) {
3947     reportVectorizationFailure("Single iteration (non) loop",
3948         "loop trip count is one, irrelevant for vectorization",
3949         "SingleIterationLoop", ORE, TheLoop);
3950     return FixedScalableVFPair::getNone();
3951   }
3952 
3953   // If BTC matches the widest induction type and is -1 then the trip count
3954   // computation will wrap to 0 and the vector trip count will be 0. Do not try
3955   // to vectorize.
3956   const SCEV *BTC = SE->getBackedgeTakenCount(TheLoop);
3957   if (!isa<SCEVCouldNotCompute>(BTC) &&
3958       BTC->getType()->getScalarSizeInBits() >=
3959           Legal->getWidestInductionType()->getScalarSizeInBits() &&
3960       SE->isKnownPredicate(CmpInst::ICMP_EQ, BTC,
3961                            SE->getMinusOne(BTC->getType()))) {
3962     reportVectorizationFailure(
3963         "Trip count computation wrapped",
3964         "backedge-taken count is -1, loop trip count wrapped to 0",
3965         "TripCountWrapped", ORE, TheLoop);
3966     return FixedScalableVFPair::getNone();
3967   }
3968 
3969   switch (ScalarEpilogueStatus) {
3970   case CM_ScalarEpilogueAllowed:
3971     return computeFeasibleMaxVF(MaxTC, UserVF, false);
3972   case CM_ScalarEpilogueNotAllowedUsePredicate:
3973     [[fallthrough]];
3974   case CM_ScalarEpilogueNotNeededUsePredicate:
3975     LLVM_DEBUG(
3976         dbgs() << "LV: vector predicate hint/switch found.\n"
3977                << "LV: Not allowing scalar epilogue, creating predicated "
3978                << "vector loop.\n");
3979     break;
3980   case CM_ScalarEpilogueNotAllowedLowTripLoop:
3981     // fallthrough as a special case of OptForSize
3982   case CM_ScalarEpilogueNotAllowedOptSize:
3983     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
3984       LLVM_DEBUG(
3985           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
3986     else
3987       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
3988                         << "count.\n");
3989 
3990     // Bail if runtime checks are required, which are not good when optimising
3991     // for size.
3992     if (runtimeChecksRequired())
3993       return FixedScalableVFPair::getNone();
3994 
3995     break;
3996   }
3997 
3998   // The only loops we can vectorize without a scalar epilogue, are loops with
3999   // a bottom-test and a single exiting block. We'd have to handle the fact
4000   // that not every instruction executes on the last iteration.  This will
4001   // require a lane mask which varies through the vector loop body.  (TODO)
4002   if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
4003     // If there was a tail-folding hint/switch, but we can't fold the tail by
4004     // masking, fallback to a vectorization with a scalar epilogue.
4005     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
4006       LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
4007                            "scalar epilogue instead.\n");
4008       ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4009       return computeFeasibleMaxVF(MaxTC, UserVF, false);
4010     }
4011     return FixedScalableVFPair::getNone();
4012   }
4013 
4014   // Now try the tail folding
4015 
4016   // Invalidate interleave groups that require an epilogue if we can't mask
4017   // the interleave-group.
4018   if (!useMaskedInterleavedAccesses(TTI)) {
4019     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
4020            "No decisions should have been taken at this point");
4021     // Note: There is no need to invalidate any cost modeling decisions here, as
4022     // none were taken so far.
4023     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
4024   }
4025 
4026   FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(MaxTC, UserVF, true);
4027 
4028   // Avoid tail folding if the trip count is known to be a multiple of any VF
4029   // we choose.
4030   std::optional<unsigned> MaxPowerOf2RuntimeVF =
4031       MaxFactors.FixedVF.getFixedValue();
4032   if (MaxFactors.ScalableVF) {
4033     std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
4034     if (MaxVScale && TTI.isVScaleKnownToBeAPowerOfTwo()) {
4035       MaxPowerOf2RuntimeVF = std::max<unsigned>(
4036           *MaxPowerOf2RuntimeVF,
4037           *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue());
4038     } else
4039       MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now.
4040   }
4041 
4042   if (MaxPowerOf2RuntimeVF && *MaxPowerOf2RuntimeVF > 0) {
4043     assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
4044            "MaxFixedVF must be a power of 2");
4045     unsigned MaxVFtimesIC =
4046         UserIC ? *MaxPowerOf2RuntimeVF * UserIC : *MaxPowerOf2RuntimeVF;
4047     ScalarEvolution *SE = PSE.getSE();
4048     // Currently only loops with countable exits are vectorized, but calling
4049     // getSymbolicMaxBackedgeTakenCount allows enablement work for loops with
4050     // uncountable exits whilst also ensuring the symbolic maximum and known
4051     // back-edge taken count remain identical for loops with countable exits.
4052     const SCEV *BackedgeTakenCount = PSE.getSymbolicMaxBackedgeTakenCount();
4053     assert(BackedgeTakenCount == PSE.getBackedgeTakenCount() &&
4054            "Invalid loop count");
4055     const SCEV *ExitCount = SE->getAddExpr(
4056         BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
4057     const SCEV *Rem = SE->getURemExpr(
4058         SE->applyLoopGuards(ExitCount, TheLoop),
4059         SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
4060     if (Rem->isZero()) {
4061       // Accept MaxFixedVF if we do not have a tail.
4062       LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
4063       return MaxFactors;
4064     }
4065   }
4066 
4067   // If we don't know the precise trip count, or if the trip count that we
4068   // found modulo the vectorization factor is not zero, try to fold the tail
4069   // by masking.
4070   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
4071   setTailFoldingStyles(MaxFactors.ScalableVF.isScalable(), UserIC);
4072   if (foldTailByMasking()) {
4073     if (getTailFoldingStyle() == TailFoldingStyle::DataWithEVL) {
4074       LLVM_DEBUG(
4075           dbgs()
4076           << "LV: tail is folded with EVL, forcing unroll factor to be 1. Will "
4077              "try to generate VP Intrinsics with scalable vector "
4078              "factors only.\n");
4079       // Tail folded loop using VP intrinsics restricts the VF to be scalable
4080       // for now.
4081       // TODO: extend it for fixed vectors, if required.
4082       assert(MaxFactors.ScalableVF.isScalable() &&
4083              "Expected scalable vector factor.");
4084 
4085       MaxFactors.FixedVF = ElementCount::getFixed(1);
4086     }
4087     return MaxFactors;
4088   }
4089 
4090   // If there was a tail-folding hint/switch, but we can't fold the tail by
4091   // masking, fallback to a vectorization with a scalar epilogue.
4092   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
4093     LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
4094                          "scalar epilogue instead.\n");
4095     ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4096     return MaxFactors;
4097   }
4098 
4099   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
4100     LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
4101     return FixedScalableVFPair::getNone();
4102   }
4103 
4104   if (TC == 0) {
4105     reportVectorizationFailure(
4106         "unable to calculate the loop count due to complex control flow",
4107         "UnknownLoopCountComplexCFG", ORE, TheLoop);
4108     return FixedScalableVFPair::getNone();
4109   }
4110 
4111   reportVectorizationFailure(
4112       "Cannot optimize for size and vectorize at the same time.",
4113       "cannot optimize for size and vectorize at the same time. "
4114       "Enable vectorization of this loop with '#pragma clang loop "
4115       "vectorize(enable)' when compiling with -Os/-Oz",
4116       "NoTailLoopWithOptForSize", ORE, TheLoop);
4117   return FixedScalableVFPair::getNone();
4118 }
4119 
4120 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
4121     unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType,
4122     ElementCount MaxSafeVF, bool FoldTailByMasking) {
4123   bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
4124   const TypeSize WidestRegister = TTI.getRegisterBitWidth(
4125       ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
4126                            : TargetTransformInfo::RGK_FixedWidthVector);
4127 
4128   // Convenience function to return the minimum of two ElementCounts.
4129   auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
4130     assert((LHS.isScalable() == RHS.isScalable()) &&
4131            "Scalable flags must match");
4132     return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
4133   };
4134 
4135   // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
4136   // Note that both WidestRegister and WidestType may not be a powers of 2.
4137   auto MaxVectorElementCount = ElementCount::get(
4138       llvm::bit_floor(WidestRegister.getKnownMinValue() / WidestType),
4139       ComputeScalableMaxVF);
4140   MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
4141   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
4142                     << (MaxVectorElementCount * WidestType) << " bits.\n");
4143 
4144   if (!MaxVectorElementCount) {
4145     LLVM_DEBUG(dbgs() << "LV: The target has no "
4146                       << (ComputeScalableMaxVF ? "scalable" : "fixed")
4147                       << " vector registers.\n");
4148     return ElementCount::getFixed(1);
4149   }
4150 
4151   unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue();
4152   if (MaxVectorElementCount.isScalable() &&
4153       TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
4154     auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
4155     auto Min = Attr.getVScaleRangeMin();
4156     WidestRegisterMinEC *= Min;
4157   }
4158 
4159   // When a scalar epilogue is required, at least one iteration of the scalar
4160   // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
4161   // max VF that results in a dead vector loop.
4162   if (MaxTripCount > 0 && requiresScalarEpilogue(true))
4163     MaxTripCount -= 1;
4164 
4165   if (MaxTripCount && MaxTripCount <= WidestRegisterMinEC &&
4166       (!FoldTailByMasking || isPowerOf2_32(MaxTripCount))) {
4167     // If upper bound loop trip count (TC) is known at compile time there is no
4168     // point in choosing VF greater than TC (as done in the loop below). Select
4169     // maximum power of two which doesn't exceed TC. If MaxVectorElementCount is
4170     // scalable, we only fall back on a fixed VF when the TC is less than or
4171     // equal to the known number of lanes.
4172     auto ClampedUpperTripCount = llvm::bit_floor(MaxTripCount);
4173     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
4174                          "exceeding the constant trip count: "
4175                       << ClampedUpperTripCount << "\n");
4176     return ElementCount::get(
4177         ClampedUpperTripCount,
4178         FoldTailByMasking ? MaxVectorElementCount.isScalable() : false);
4179   }
4180 
4181   TargetTransformInfo::RegisterKind RegKind =
4182       ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
4183                            : TargetTransformInfo::RGK_FixedWidthVector;
4184   ElementCount MaxVF = MaxVectorElementCount;
4185   if (MaximizeBandwidth ||
4186       (MaximizeBandwidth.getNumOccurrences() == 0 &&
4187        (TTI.shouldMaximizeVectorBandwidth(RegKind) ||
4188         (UseWiderVFIfCallVariantsPresent && Legal->hasVectorCallVariants())))) {
4189     auto MaxVectorElementCountMaxBW = ElementCount::get(
4190         llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType),
4191         ComputeScalableMaxVF);
4192     MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
4193 
4194     // Collect all viable vectorization factors larger than the default MaxVF
4195     // (i.e. MaxVectorElementCount).
4196     SmallVector<ElementCount, 8> VFs;
4197     for (ElementCount VS = MaxVectorElementCount * 2;
4198          ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
4199       VFs.push_back(VS);
4200 
4201     // For each VF calculate its register usage.
4202     auto RUs = calculateRegisterUsage(VFs);
4203 
4204     // Select the largest VF which doesn't require more registers than existing
4205     // ones.
4206     for (int I = RUs.size() - 1; I >= 0; --I) {
4207       const auto &MLU = RUs[I].MaxLocalUsers;
4208       if (all_of(MLU, [&](decltype(MLU.front()) &LU) {
4209             return LU.second <= TTI.getNumberOfRegisters(LU.first);
4210           })) {
4211         MaxVF = VFs[I];
4212         break;
4213       }
4214     }
4215     if (ElementCount MinVF =
4216             TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
4217       if (ElementCount::isKnownLT(MaxVF, MinVF)) {
4218         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
4219                           << ") with target's minimum: " << MinVF << '\n');
4220         MaxVF = MinVF;
4221       }
4222     }
4223 
4224     // Invalidate any widening decisions we might have made, in case the loop
4225     // requires prediction (decided later), but we have already made some
4226     // load/store widening decisions.
4227     invalidateCostModelingDecisions();
4228   }
4229   return MaxVF;
4230 }
4231 
4232 /// Convenience function that returns the value of vscale_range iff
4233 /// vscale_range.min == vscale_range.max or otherwise returns the value
4234 /// returned by the corresponding TTI method.
4235 static std::optional<unsigned>
4236 getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI) {
4237   const Function *Fn = L->getHeader()->getParent();
4238   if (Fn->hasFnAttribute(Attribute::VScaleRange)) {
4239     auto Attr = Fn->getFnAttribute(Attribute::VScaleRange);
4240     auto Min = Attr.getVScaleRangeMin();
4241     auto Max = Attr.getVScaleRangeMax();
4242     if (Max && Min == Max)
4243       return Max;
4244   }
4245 
4246   return TTI.getVScaleForTuning();
4247 }
4248 
4249 /// This function attempts to return a value that represents the vectorization
4250 /// factor at runtime. For fixed-width VFs we know this precisely at compile
4251 /// time, but for scalable VFs we calculate it based on an estimate of the
4252 /// vscale value.
4253 static unsigned getEstimatedRuntimeVF(const Loop *L,
4254                                       const TargetTransformInfo &TTI,
4255                                       ElementCount VF) {
4256   unsigned EstimatedVF = VF.getKnownMinValue();
4257   if (VF.isScalable())
4258     if (std::optional<unsigned> VScale = getVScaleForTuning(L, TTI))
4259       EstimatedVF *= *VScale;
4260   assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
4261   return EstimatedVF;
4262 }
4263 
4264 bool LoopVectorizationPlanner::isMoreProfitable(
4265     const VectorizationFactor &A, const VectorizationFactor &B,
4266     const unsigned MaxTripCount) const {
4267   InstructionCost CostA = A.Cost;
4268   InstructionCost CostB = B.Cost;
4269 
4270   // Improve estimate for the vector width if it is scalable.
4271   unsigned EstimatedWidthA = A.Width.getKnownMinValue();
4272   unsigned EstimatedWidthB = B.Width.getKnownMinValue();
4273   if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI)) {
4274     if (A.Width.isScalable())
4275       EstimatedWidthA *= *VScale;
4276     if (B.Width.isScalable())
4277       EstimatedWidthB *= *VScale;
4278   }
4279 
4280   // Assume vscale may be larger than 1 (or the value being tuned for),
4281   // so that scalable vectorization is slightly favorable over fixed-width
4282   // vectorization.
4283   bool PreferScalable = !TTI.preferFixedOverScalableIfEqualCost() &&
4284                         A.Width.isScalable() && !B.Width.isScalable();
4285 
4286   auto CmpFn = [PreferScalable](const InstructionCost &LHS,
4287                                 const InstructionCost &RHS) {
4288     return PreferScalable ? LHS <= RHS : LHS < RHS;
4289   };
4290 
4291   // To avoid the need for FP division:
4292   //      (CostA / EstimatedWidthA) < (CostB / EstimatedWidthB)
4293   // <=>  (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA)
4294   if (!MaxTripCount)
4295     return CmpFn(CostA * EstimatedWidthB, CostB * EstimatedWidthA);
4296 
4297   auto GetCostForTC = [MaxTripCount, this](unsigned VF,
4298                                            InstructionCost VectorCost,
4299                                            InstructionCost ScalarCost) {
4300     // If the trip count is a known (possibly small) constant, the trip count
4301     // will be rounded up to an integer number of iterations under
4302     // FoldTailByMasking. The total cost in that case will be
4303     // VecCost*ceil(TripCount/VF). When not folding the tail, the total
4304     // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be
4305     // some extra overheads, but for the purpose of comparing the costs of
4306     // different VFs we can use this to compare the total loop-body cost
4307     // expected after vectorization.
4308     if (CM.foldTailByMasking())
4309       return VectorCost * divideCeil(MaxTripCount, VF);
4310     return VectorCost * (MaxTripCount / VF) + ScalarCost * (MaxTripCount % VF);
4311   };
4312 
4313   auto RTCostA = GetCostForTC(EstimatedWidthA, CostA, A.ScalarCost);
4314   auto RTCostB = GetCostForTC(EstimatedWidthB, CostB, B.ScalarCost);
4315   return CmpFn(RTCostA, RTCostB);
4316 }
4317 
4318 bool LoopVectorizationPlanner::isMoreProfitable(
4319     const VectorizationFactor &A, const VectorizationFactor &B) const {
4320   const unsigned MaxTripCount = PSE.getSmallConstantMaxTripCount();
4321   return LoopVectorizationPlanner::isMoreProfitable(A, B, MaxTripCount);
4322 }
4323 
4324 void LoopVectorizationPlanner::emitInvalidCostRemarks(
4325     OptimizationRemarkEmitter *ORE) {
4326   using RecipeVFPair = std::pair<VPRecipeBase *, ElementCount>;
4327   SmallVector<RecipeVFPair> InvalidCosts;
4328   for (const auto &Plan : VPlans) {
4329     for (ElementCount VF : Plan->vectorFactors()) {
4330       VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(),
4331                             CM, CM.CostKind);
4332       precomputeCosts(*Plan, VF, CostCtx);
4333       auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry());
4334       for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
4335         for (auto &R : *VPBB) {
4336           if (!R.cost(VF, CostCtx).isValid())
4337             InvalidCosts.emplace_back(&R, VF);
4338         }
4339       }
4340     }
4341   }
4342   if (InvalidCosts.empty())
4343     return;
4344 
4345   // Emit a report of VFs with invalid costs in the loop.
4346 
4347   // Group the remarks per recipe, keeping the recipe order from InvalidCosts.
4348   DenseMap<VPRecipeBase *, unsigned> Numbering;
4349   unsigned I = 0;
4350   for (auto &Pair : InvalidCosts)
4351     if (!Numbering.count(Pair.first))
4352       Numbering[Pair.first] = I++;
4353 
4354   // Sort the list, first on recipe(number) then on VF.
4355   sort(InvalidCosts, [&Numbering](RecipeVFPair &A, RecipeVFPair &B) {
4356     if (Numbering[A.first] != Numbering[B.first])
4357       return Numbering[A.first] < Numbering[B.first];
4358     const auto &LHS = A.second;
4359     const auto &RHS = B.second;
4360     return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
4361            std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
4362   });
4363 
4364   // For a list of ordered recipe-VF pairs:
4365   //   [(load, VF1), (load, VF2), (store, VF1)]
4366   // group the recipes together to emit separate remarks for:
4367   //   load  (VF1, VF2)
4368   //   store (VF1)
4369   auto Tail = ArrayRef<RecipeVFPair>(InvalidCosts);
4370   auto Subset = ArrayRef<RecipeVFPair>();
4371   do {
4372     if (Subset.empty())
4373       Subset = Tail.take_front(1);
4374 
4375     VPRecipeBase *R = Subset.front().first;
4376 
4377     unsigned Opcode =
4378         TypeSwitch<const VPRecipeBase *, unsigned>(R)
4379             .Case<VPHeaderPHIRecipe>(
4380                 [](const auto *R) { return Instruction::PHI; })
4381             .Case<VPWidenSelectRecipe>(
4382                 [](const auto *R) { return Instruction::Select; })
4383             .Case<VPWidenStoreRecipe>(
4384                 [](const auto *R) { return Instruction::Store; })
4385             .Case<VPWidenLoadRecipe>(
4386                 [](const auto *R) { return Instruction::Load; })
4387             .Case<VPWidenCallRecipe, VPWidenIntrinsicRecipe>(
4388                 [](const auto *R) { return Instruction::Call; })
4389             .Case<VPInstruction, VPWidenRecipe, VPReplicateRecipe,
4390                   VPWidenCastRecipe>(
4391                 [](const auto *R) { return R->getOpcode(); })
4392             .Case<VPInterleaveRecipe>([](const VPInterleaveRecipe *R) {
4393               return R->getStoredValues().empty() ? Instruction::Load
4394                                                   : Instruction::Store;
4395             });
4396 
4397     // If the next recipe is different, or if there are no other pairs,
4398     // emit a remark for the collated subset. e.g.
4399     //   [(load, VF1), (load, VF2))]
4400     // to emit:
4401     //  remark: invalid costs for 'load' at VF=(VF1, VF2)
4402     if (Subset == Tail || Tail[Subset.size()].first != R) {
4403       std::string OutString;
4404       raw_string_ostream OS(OutString);
4405       assert(!Subset.empty() && "Unexpected empty range");
4406       OS << "Recipe with invalid costs prevented vectorization at VF=(";
4407       for (const auto &Pair : Subset)
4408         OS << (Pair.second == Subset.front().second ? "" : ", ") << Pair.second;
4409       OS << "):";
4410       if (Opcode == Instruction::Call) {
4411         StringRef Name = "";
4412         if (auto *Int = dyn_cast<VPWidenIntrinsicRecipe>(R)) {
4413           Name = Int->getIntrinsicName();
4414         } else {
4415           auto *WidenCall = dyn_cast<VPWidenCallRecipe>(R);
4416           Function *CalledFn =
4417               WidenCall ? WidenCall->getCalledScalarFunction()
4418                         : cast<Function>(R->getOperand(R->getNumOperands() - 1)
4419                                              ->getLiveInIRValue());
4420           Name = CalledFn->getName();
4421         }
4422         OS << " call to " << Name;
4423       } else
4424         OS << " " << Instruction::getOpcodeName(Opcode);
4425       reportVectorizationInfo(OutString, "InvalidCost", ORE, OrigLoop, nullptr,
4426                               R->getDebugLoc());
4427       Tail = Tail.drop_front(Subset.size());
4428       Subset = {};
4429     } else
4430       // Grow the subset by one element
4431       Subset = Tail.take_front(Subset.size() + 1);
4432   } while (!Tail.empty());
4433 }
4434 
4435 /// Check if any recipe of \p Plan will generate a vector value, which will be
4436 /// assigned a vector register.
4437 static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
4438                                 const TargetTransformInfo &TTI) {
4439   assert(VF.isVector() && "Checking a scalar VF?");
4440   VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
4441   DenseSet<VPRecipeBase *> EphemeralRecipes;
4442   collectEphemeralRecipesForVPlan(Plan, EphemeralRecipes);
4443   // Set of already visited types.
4444   DenseSet<Type *> Visited;
4445   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
4446            vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
4447     for (VPRecipeBase &R : *VPBB) {
4448       if (EphemeralRecipes.contains(&R))
4449         continue;
4450       // Continue early if the recipe is considered to not produce a vector
4451       // result. Note that this includes VPInstruction where some opcodes may
4452       // produce a vector, to preserve existing behavior as VPInstructions model
4453       // aspects not directly mapped to existing IR instructions.
4454       switch (R.getVPDefID()) {
4455       case VPDef::VPDerivedIVSC:
4456       case VPDef::VPScalarIVStepsSC:
4457       case VPDef::VPScalarCastSC:
4458       case VPDef::VPReplicateSC:
4459       case VPDef::VPInstructionSC:
4460       case VPDef::VPCanonicalIVPHISC:
4461       case VPDef::VPVectorPointerSC:
4462       case VPDef::VPReverseVectorPointerSC:
4463       case VPDef::VPExpandSCEVSC:
4464       case VPDef::VPEVLBasedIVPHISC:
4465       case VPDef::VPPredInstPHISC:
4466       case VPDef::VPBranchOnMaskSC:
4467         continue;
4468       case VPDef::VPReductionSC:
4469       case VPDef::VPActiveLaneMaskPHISC:
4470       case VPDef::VPWidenCallSC:
4471       case VPDef::VPWidenCanonicalIVSC:
4472       case VPDef::VPWidenCastSC:
4473       case VPDef::VPWidenGEPSC:
4474       case VPDef::VPWidenIntrinsicSC:
4475       case VPDef::VPWidenSC:
4476       case VPDef::VPWidenSelectSC:
4477       case VPDef::VPBlendSC:
4478       case VPDef::VPFirstOrderRecurrencePHISC:
4479       case VPDef::VPWidenPHISC:
4480       case VPDef::VPWidenIntOrFpInductionSC:
4481       case VPDef::VPWidenPointerInductionSC:
4482       case VPDef::VPReductionPHISC:
4483       case VPDef::VPInterleaveSC:
4484       case VPDef::VPWidenLoadEVLSC:
4485       case VPDef::VPWidenLoadSC:
4486       case VPDef::VPWidenStoreEVLSC:
4487       case VPDef::VPWidenStoreSC:
4488         break;
4489       default:
4490         llvm_unreachable("unhandled recipe");
4491       }
4492 
4493       auto WillWiden = [&TTI, VF](Type *ScalarTy) {
4494         Type *VectorTy = toVectorTy(ScalarTy, VF);
4495         unsigned NumLegalParts = TTI.getNumberOfParts(VectorTy);
4496         if (!NumLegalParts)
4497           return false;
4498         if (VF.isScalable()) {
4499           // <vscale x 1 x iN> is assumed to be profitable over iN because
4500           // scalable registers are a distinct register class from scalar
4501           // ones. If we ever find a target which wants to lower scalable
4502           // vectors back to scalars, we'll need to update this code to
4503           // explicitly ask TTI about the register class uses for each part.
4504           return NumLegalParts <= VF.getKnownMinValue();
4505         }
4506         // Two or more parts that share a register - are vectorized.
4507         return NumLegalParts < VF.getKnownMinValue();
4508       };
4509 
4510       // If no def nor is a store, e.g., branches, continue - no value to check.
4511       if (R.getNumDefinedValues() == 0 &&
4512           !isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe, VPInterleaveRecipe>(
4513               &R))
4514         continue;
4515       // For multi-def recipes, currently only interleaved loads, suffice to
4516       // check first def only.
4517       // For stores check their stored value; for interleaved stores suffice
4518       // the check first stored value only. In all cases this is the second
4519       // operand.
4520       VPValue *ToCheck =
4521           R.getNumDefinedValues() >= 1 ? R.getVPValue(0) : R.getOperand(1);
4522       Type *ScalarTy = TypeInfo.inferScalarType(ToCheck);
4523       if (!Visited.insert({ScalarTy}).second)
4524         continue;
4525       if (WillWiden(ScalarTy))
4526         return true;
4527     }
4528   }
4529 
4530   return false;
4531 }
4532 
4533 #ifndef NDEBUG
4534 VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
4535   InstructionCost ExpectedCost = CM.expectedCost(ElementCount::getFixed(1));
4536   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
4537   assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
4538   assert(any_of(VPlans,
4539                 [](std::unique_ptr<VPlan> &P) {
4540                   return P->hasVF(ElementCount::getFixed(1));
4541                 }) &&
4542          "Expected Scalar VF to be a candidate");
4543 
4544   const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost,
4545                                        ExpectedCost);
4546   VectorizationFactor ChosenFactor = ScalarCost;
4547 
4548   bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
4549   if (ForceVectorization &&
4550       (VPlans.size() > 1 || !VPlans[0]->hasScalarVFOnly())) {
4551     // Ignore scalar width, because the user explicitly wants vectorization.
4552     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
4553     // evaluation.
4554     ChosenFactor.Cost = InstructionCost::getMax();
4555   }
4556 
4557   for (auto &P : VPlans) {
4558     for (ElementCount VF : P->vectorFactors()) {
4559       // The cost for scalar VF=1 is already calculated, so ignore it.
4560       if (VF.isScalar())
4561         continue;
4562 
4563       InstructionCost C = CM.expectedCost(VF);
4564       VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost);
4565 
4566       unsigned Width = getEstimatedRuntimeVF(OrigLoop, TTI, Candidate.Width);
4567       LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << VF
4568                         << " costs: " << (Candidate.Cost / Width));
4569       if (VF.isScalable())
4570         LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
4571                           << getVScaleForTuning(OrigLoop, TTI).value_or(1)
4572                           << ")");
4573       LLVM_DEBUG(dbgs() << ".\n");
4574 
4575       if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
4576         LLVM_DEBUG(
4577             dbgs()
4578             << "LV: Not considering vector loop of width " << VF
4579             << " because it will not generate any vector instructions.\n");
4580         continue;
4581       }
4582 
4583       if (isMoreProfitable(Candidate, ChosenFactor))
4584         ChosenFactor = Candidate;
4585     }
4586   }
4587 
4588   if (!EnableCondStoresVectorization && CM.hasPredStores()) {
4589     reportVectorizationFailure(
4590         "There are conditional stores.",
4591         "store that is conditionally executed prevents vectorization",
4592         "ConditionalStore", ORE, OrigLoop);
4593     ChosenFactor = ScalarCost;
4594   }
4595 
4596   LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
4597                  !isMoreProfitable(ChosenFactor, ScalarCost)) dbgs()
4598              << "LV: Vectorization seems to be not beneficial, "
4599              << "but was forced by a user.\n");
4600   return ChosenFactor;
4601 }
4602 #endif
4603 
4604 bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
4605     ElementCount VF) const {
4606   // Cross iteration phis such as reductions need special handling and are
4607   // currently unsupported.
4608   if (any_of(OrigLoop->getHeader()->phis(),
4609              [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(&Phi); }))
4610     return false;
4611 
4612   // Phis with uses outside of the loop require special handling and are
4613   // currently unsupported.
4614   for (const auto &Entry : Legal->getInductionVars()) {
4615     // Look for uses of the value of the induction at the last iteration.
4616     Value *PostInc =
4617         Entry.first->getIncomingValueForBlock(OrigLoop->getLoopLatch());
4618     for (User *U : PostInc->users())
4619       if (!OrigLoop->contains(cast<Instruction>(U)))
4620         return false;
4621     // Look for uses of penultimate value of the induction.
4622     for (User *U : Entry.first->users())
4623       if (!OrigLoop->contains(cast<Instruction>(U)))
4624         return false;
4625   }
4626 
4627   // Epilogue vectorization code has not been auditted to ensure it handles
4628   // non-latch exits properly.  It may be fine, but it needs auditted and
4629   // tested.
4630   // TODO: Add support for loops with an early exit.
4631   if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch())
4632     return false;
4633 
4634   return true;
4635 }
4636 
4637 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
4638     const ElementCount VF, const unsigned IC) const {
4639   // FIXME: We need a much better cost-model to take different parameters such
4640   // as register pressure, code size increase and cost of extra branches into
4641   // account. For now we apply a very crude heuristic and only consider loops
4642   // with vectorization factors larger than a certain value.
4643 
4644   // Allow the target to opt out entirely.
4645   if (!TTI.preferEpilogueVectorization())
4646     return false;
4647 
4648   // We also consider epilogue vectorization unprofitable for targets that don't
4649   // consider interleaving beneficial (eg. MVE).
4650   if (TTI.getMaxInterleaveFactor(VF) <= 1)
4651     return false;
4652 
4653   // TODO: PR #108190 introduced a discrepancy between fixed-width and scalable
4654   // VFs when deciding profitability.
4655   // See related "TODO: extend to support scalable VFs." in
4656   // selectEpilogueVectorizationFactor.
4657   unsigned Multiplier = VF.isFixed() ? IC : 1;
4658   unsigned MinVFThreshold = EpilogueVectorizationMinVF.getNumOccurrences() > 0
4659                                 ? EpilogueVectorizationMinVF
4660                                 : TTI.getEpilogueVectorizationMinVF();
4661   return getEstimatedRuntimeVF(TheLoop, TTI, VF * Multiplier) >= MinVFThreshold;
4662 }
4663 
4664 VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
4665     const ElementCount MainLoopVF, unsigned IC) {
4666   VectorizationFactor Result = VectorizationFactor::Disabled();
4667   if (!EnableEpilogueVectorization) {
4668     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n");
4669     return Result;
4670   }
4671 
4672   if (!CM.isScalarEpilogueAllowed()) {
4673     LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no "
4674                          "epilogue is allowed.\n");
4675     return Result;
4676   }
4677 
4678   // Not really a cost consideration, but check for unsupported cases here to
4679   // simplify the logic.
4680   if (!isCandidateForEpilogueVectorization(MainLoopVF)) {
4681     LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop "
4682                          "is not a supported candidate.\n");
4683     return Result;
4684   }
4685 
4686   if (EpilogueVectorizationForceVF > 1) {
4687     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n");
4688     ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF);
4689     if (hasPlanWithVF(ForcedEC))
4690       return {ForcedEC, 0, 0};
4691 
4692     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not "
4693                          "viable.\n");
4694     return Result;
4695   }
4696 
4697   if (OrigLoop->getHeader()->getParent()->hasOptSize() ||
4698       OrigLoop->getHeader()->getParent()->hasMinSize()) {
4699     LLVM_DEBUG(
4700         dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n");
4701     return Result;
4702   }
4703 
4704   if (!CM.isEpilogueVectorizationProfitable(MainLoopVF, IC)) {
4705     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
4706                          "this loop\n");
4707     return Result;
4708   }
4709 
4710   // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
4711   // the main loop handles 8 lanes per iteration. We could still benefit from
4712   // vectorizing the epilogue loop with VF=4.
4713   ElementCount EstimatedRuntimeVF =
4714       ElementCount::getFixed(getEstimatedRuntimeVF(OrigLoop, TTI, MainLoopVF));
4715 
4716   ScalarEvolution &SE = *PSE.getSE();
4717   Type *TCType = Legal->getWidestInductionType();
4718   const SCEV *RemainingIterations = nullptr;
4719   unsigned MaxTripCount = 0;
4720   for (auto &NextVF : ProfitableVFs) {
4721     // Skip candidate VFs without a corresponding VPlan.
4722     if (!hasPlanWithVF(NextVF.Width))
4723       continue;
4724 
4725     // Skip candidate VFs with widths >= the (estimated) runtime VF (scalable
4726     // vectors) or > the VF of the main loop (fixed vectors).
4727     if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
4728          ElementCount::isKnownGE(NextVF.Width, EstimatedRuntimeVF)) ||
4729         (NextVF.Width.isScalable() &&
4730          ElementCount::isKnownGE(NextVF.Width, MainLoopVF)) ||
4731         (!NextVF.Width.isScalable() && !MainLoopVF.isScalable() &&
4732          ElementCount::isKnownGT(NextVF.Width, MainLoopVF)))
4733       continue;
4734 
4735     // If NextVF is greater than the number of remaining iterations, the
4736     // epilogue loop would be dead. Skip such factors.
4737     if (!MainLoopVF.isScalable() && !NextVF.Width.isScalable()) {
4738       // TODO: extend to support scalable VFs.
4739       if (!RemainingIterations) {
4740         const SCEV *TC = vputils::getSCEVExprForVPValue(
4741             getPlanFor(NextVF.Width).getTripCount(), SE);
4742         assert(!isa<SCEVCouldNotCompute>(TC) &&
4743                "Trip count SCEV must be computable");
4744         RemainingIterations = SE.getURemExpr(
4745             TC, SE.getConstant(TCType, MainLoopVF.getKnownMinValue() * IC));
4746         MaxTripCount = MainLoopVF.getKnownMinValue() * IC - 1;
4747         if (SE.isKnownPredicate(CmpInst::ICMP_ULT, RemainingIterations,
4748                                 SE.getConstant(TCType, MaxTripCount))) {
4749           MaxTripCount =
4750               SE.getUnsignedRangeMax(RemainingIterations).getZExtValue();
4751         }
4752         LLVM_DEBUG(dbgs() << "LEV: Maximum Trip Count for Epilogue: "
4753                           << MaxTripCount << "\n");
4754       }
4755       if (SE.isKnownPredicate(
4756               CmpInst::ICMP_UGT,
4757               SE.getConstant(TCType, NextVF.Width.getKnownMinValue()),
4758               RemainingIterations))
4759         continue;
4760     }
4761 
4762     if (Result.Width.isScalar() ||
4763         isMoreProfitable(NextVF, Result, MaxTripCount))
4764       Result = NextVF;
4765   }
4766 
4767   if (Result != VectorizationFactor::Disabled())
4768     LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
4769                       << Result.Width << "\n");
4770   return Result;
4771 }
4772 
4773 std::pair<unsigned, unsigned>
4774 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
4775   unsigned MinWidth = -1U;
4776   unsigned MaxWidth = 8;
4777   const DataLayout &DL = TheFunction->getDataLayout();
4778   // For in-loop reductions, no element types are added to ElementTypesInLoop
4779   // if there are no loads/stores in the loop. In this case, check through the
4780   // reduction variables to determine the maximum width.
4781   if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
4782     // Reset MaxWidth so that we can find the smallest type used by recurrences
4783     // in the loop.
4784     MaxWidth = -1U;
4785     for (const auto &PhiDescriptorPair : Legal->getReductionVars()) {
4786       const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
4787       // When finding the min width used by the recurrence we need to account
4788       // for casts on the input operands of the recurrence.
4789       MaxWidth = std::min<unsigned>(
4790           MaxWidth, std::min<unsigned>(
4791                         RdxDesc.getMinWidthCastToRecurrenceTypeInBits(),
4792                         RdxDesc.getRecurrenceType()->getScalarSizeInBits()));
4793     }
4794   } else {
4795     for (Type *T : ElementTypesInLoop) {
4796       MinWidth = std::min<unsigned>(
4797           MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
4798       MaxWidth = std::max<unsigned>(
4799           MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
4800     }
4801   }
4802   return {MinWidth, MaxWidth};
4803 }
4804 
4805 void LoopVectorizationCostModel::collectElementTypesForWidening() {
4806   ElementTypesInLoop.clear();
4807   // For each block.
4808   for (BasicBlock *BB : TheLoop->blocks()) {
4809     // For each instruction in the loop.
4810     for (Instruction &I : BB->instructionsWithoutDebug()) {
4811       Type *T = I.getType();
4812 
4813       // Skip ignored values.
4814       if (ValuesToIgnore.count(&I))
4815         continue;
4816 
4817       // Only examine Loads, Stores and PHINodes.
4818       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
4819         continue;
4820 
4821       // Examine PHI nodes that are reduction variables. Update the type to
4822       // account for the recurrence type.
4823       if (auto *PN = dyn_cast<PHINode>(&I)) {
4824         if (!Legal->isReductionVariable(PN))
4825           continue;
4826         const RecurrenceDescriptor &RdxDesc =
4827             Legal->getReductionVars().find(PN)->second;
4828         if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
4829             TTI.preferInLoopReduction(RdxDesc.getOpcode(),
4830                                       RdxDesc.getRecurrenceType(),
4831                                       TargetTransformInfo::ReductionFlags()))
4832           continue;
4833         T = RdxDesc.getRecurrenceType();
4834       }
4835 
4836       // Examine the stored values.
4837       if (auto *ST = dyn_cast<StoreInst>(&I))
4838         T = ST->getValueOperand()->getType();
4839 
4840       assert(T->isSized() &&
4841              "Expected the load/store/recurrence type to be sized");
4842 
4843       ElementTypesInLoop.insert(T);
4844     }
4845   }
4846 }
4847 
4848 unsigned
4849 LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
4850                                                   InstructionCost LoopCost) {
4851   // -- The interleave heuristics --
4852   // We interleave the loop in order to expose ILP and reduce the loop overhead.
4853   // There are many micro-architectural considerations that we can't predict
4854   // at this level. For example, frontend pressure (on decode or fetch) due to
4855   // code size, or the number and capabilities of the execution ports.
4856   //
4857   // We use the following heuristics to select the interleave count:
4858   // 1. If the code has reductions, then we interleave to break the cross
4859   // iteration dependency.
4860   // 2. If the loop is really small, then we interleave to reduce the loop
4861   // overhead.
4862   // 3. We don't interleave if we think that we will spill registers to memory
4863   // due to the increased register pressure.
4864 
4865   if (!isScalarEpilogueAllowed())
4866     return 1;
4867 
4868   // Do not interleave if EVL is preferred and no User IC is specified.
4869   if (foldTailWithEVL()) {
4870     LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. "
4871                          "Unroll factor forced to be 1.\n");
4872     return 1;
4873   }
4874 
4875   // We used the distance for the interleave count.
4876   if (!Legal->isSafeForAnyVectorWidth())
4877     return 1;
4878 
4879   // We don't attempt to perform interleaving for loops with uncountable early
4880   // exits because the VPInstruction::AnyOf code cannot currently handle
4881   // multiple parts.
4882   if (Legal->hasUncountableEarlyExit())
4883     return 1;
4884 
4885   auto BestKnownTC = getSmallBestKnownTC(PSE, TheLoop);
4886   const bool HasReductions = !Legal->getReductionVars().empty();
4887 
4888   // If we did not calculate the cost for VF (because the user selected the VF)
4889   // then we calculate the cost of VF here.
4890   if (LoopCost == 0) {
4891     LoopCost = expectedCost(VF);
4892     assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost");
4893 
4894     // Loop body is free and there is no need for interleaving.
4895     if (LoopCost == 0)
4896       return 1;
4897   }
4898 
4899   RegisterUsage R = calculateRegisterUsage({VF})[0];
4900   // We divide by these constants so assume that we have at least one
4901   // instruction that uses at least one register.
4902   for (auto &Pair : R.MaxLocalUsers) {
4903     Pair.second = std::max(Pair.second, 1U);
4904   }
4905 
4906   // We calculate the interleave count using the following formula.
4907   // Subtract the number of loop invariants from the number of available
4908   // registers. These registers are used by all of the interleaved instances.
4909   // Next, divide the remaining registers by the number of registers that is
4910   // required by the loop, in order to estimate how many parallel instances
4911   // fit without causing spills. All of this is rounded down if necessary to be
4912   // a power of two. We want power of two interleave count to simplify any
4913   // addressing operations or alignment considerations.
4914   // We also want power of two interleave counts to ensure that the induction
4915   // variable of the vector loop wraps to zero, when tail is folded by masking;
4916   // this currently happens when OptForSize, in which case IC is set to 1 above.
4917   unsigned IC = UINT_MAX;
4918 
4919   for (const auto &Pair : R.MaxLocalUsers) {
4920     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(Pair.first);
4921     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
4922                       << " registers of "
4923                       << TTI.getRegisterClassName(Pair.first)
4924                       << " register class\n");
4925     if (VF.isScalar()) {
4926       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
4927         TargetNumRegisters = ForceTargetNumScalarRegs;
4928     } else {
4929       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
4930         TargetNumRegisters = ForceTargetNumVectorRegs;
4931     }
4932     unsigned MaxLocalUsers = Pair.second;
4933     unsigned LoopInvariantRegs = 0;
4934     if (R.LoopInvariantRegs.find(Pair.first) != R.LoopInvariantRegs.end())
4935       LoopInvariantRegs = R.LoopInvariantRegs[Pair.first];
4936 
4937     unsigned TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs) /
4938                                      MaxLocalUsers);
4939     // Don't count the induction variable as interleaved.
4940     if (EnableIndVarRegisterHeur) {
4941       TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs - 1) /
4942                               std::max(1U, (MaxLocalUsers - 1)));
4943     }
4944 
4945     IC = std::min(IC, TmpIC);
4946   }
4947 
4948   // Clamp the interleave ranges to reasonable counts.
4949   unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
4950 
4951   // Check if the user has overridden the max.
4952   if (VF.isScalar()) {
4953     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
4954       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
4955   } else {
4956     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
4957       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
4958   }
4959 
4960   unsigned EstimatedVF = getEstimatedRuntimeVF(TheLoop, TTI, VF);
4961   unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4962   if (KnownTC > 0) {
4963     // At least one iteration must be scalar when this constraint holds. So the
4964     // maximum available iterations for interleaving is one less.
4965     unsigned AvailableTC =
4966         requiresScalarEpilogue(VF.isVector()) ? KnownTC - 1 : KnownTC;
4967 
4968     // If trip count is known we select between two prospective ICs, where
4969     // 1) the aggressive IC is capped by the trip count divided by VF
4970     // 2) the conservative IC is capped by the trip count divided by (VF * 2)
4971     // The final IC is selected in a way that the epilogue loop trip count is
4972     // minimized while maximizing the IC itself, so that we either run the
4973     // vector loop at least once if it generates a small epilogue loop, or else
4974     // we run the vector loop at least twice.
4975 
4976     unsigned InterleaveCountUB = bit_floor(
4977         std::max(1u, std::min(AvailableTC / EstimatedVF, MaxInterleaveCount)));
4978     unsigned InterleaveCountLB = bit_floor(std::max(
4979         1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
4980     MaxInterleaveCount = InterleaveCountLB;
4981 
4982     if (InterleaveCountUB != InterleaveCountLB) {
4983       unsigned TailTripCountUB =
4984           (AvailableTC % (EstimatedVF * InterleaveCountUB));
4985       unsigned TailTripCountLB =
4986           (AvailableTC % (EstimatedVF * InterleaveCountLB));
4987       // If both produce same scalar tail, maximize the IC to do the same work
4988       // in fewer vector loop iterations
4989       if (TailTripCountUB == TailTripCountLB)
4990         MaxInterleaveCount = InterleaveCountUB;
4991     }
4992   } else if (BestKnownTC && *BestKnownTC > 0) {
4993     // At least one iteration must be scalar when this constraint holds. So the
4994     // maximum available iterations for interleaving is one less.
4995     unsigned AvailableTC = requiresScalarEpilogue(VF.isVector())
4996                                ? (*BestKnownTC) - 1
4997                                : *BestKnownTC;
4998 
4999     // If trip count is an estimated compile time constant, limit the
5000     // IC to be capped by the trip count divided by VF * 2, such that the vector
5001     // loop runs at least twice to make interleaving seem profitable when there
5002     // is an epilogue loop present. Since exact Trip count is not known we
5003     // choose to be conservative in our IC estimate.
5004     MaxInterleaveCount = bit_floor(std::max(
5005         1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
5006   }
5007 
5008   assert(MaxInterleaveCount > 0 &&
5009          "Maximum interleave count must be greater than 0");
5010 
5011   // Clamp the calculated IC to be between the 1 and the max interleave count
5012   // that the target and trip count allows.
5013   if (IC > MaxInterleaveCount)
5014     IC = MaxInterleaveCount;
5015   else
5016     // Make sure IC is greater than 0.
5017     IC = std::max(1u, IC);
5018 
5019   assert(IC > 0 && "Interleave count must be greater than 0.");
5020 
5021   // Interleave if we vectorized this loop and there is a reduction that could
5022   // benefit from interleaving.
5023   if (VF.isVector() && HasReductions) {
5024     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5025     return IC;
5026   }
5027 
5028   // For any scalar loop that either requires runtime checks or predication we
5029   // are better off leaving this to the unroller. Note that if we've already
5030   // vectorized the loop we will have done the runtime check and so interleaving
5031   // won't require further checks.
5032   bool ScalarInterleavingRequiresPredication =
5033       (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) {
5034          return Legal->blockNeedsPredication(BB);
5035        }));
5036   bool ScalarInterleavingRequiresRuntimePointerCheck =
5037       (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
5038 
5039   // We want to interleave small loops in order to reduce the loop overhead and
5040   // potentially expose ILP opportunities.
5041   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
5042                     << "LV: IC is " << IC << '\n'
5043                     << "LV: VF is " << VF << '\n');
5044   const bool AggressivelyInterleaveReductions =
5045       TTI.enableAggressiveInterleaving(HasReductions);
5046   if (!ScalarInterleavingRequiresRuntimePointerCheck &&
5047       !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
5048     // We assume that the cost overhead is 1 and we use the cost model
5049     // to estimate the cost of the loop and interleave until the cost of the
5050     // loop overhead is about 5% of the cost of the loop.
5051     unsigned SmallIC = std::min(IC, (unsigned)llvm::bit_floor<uint64_t>(
5052                                         SmallLoopCost / *LoopCost.getValue()));
5053 
5054     // Interleave until store/load ports (estimated by max interleave count) are
5055     // saturated.
5056     unsigned NumStores = Legal->getNumStores();
5057     unsigned NumLoads = Legal->getNumLoads();
5058     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5059     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5060 
5061     // There is little point in interleaving for reductions containing selects
5062     // and compares when VF=1 since it may just create more overhead than it's
5063     // worth for loops with small trip counts. This is because we still have to
5064     // do the final reduction after the loop.
5065     bool HasSelectCmpReductions =
5066         HasReductions &&
5067         any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5068           const RecurrenceDescriptor &RdxDesc = Reduction.second;
5069           RecurKind RK = RdxDesc.getRecurrenceKind();
5070           return RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) ||
5071                  RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK);
5072         });
5073     if (HasSelectCmpReductions) {
5074       LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
5075       return 1;
5076     }
5077 
5078     // If we have a scalar reduction (vector reductions are already dealt with
5079     // by this point), we can increase the critical path length if the loop
5080     // we're interleaving is inside another loop. For tree-wise reductions
5081     // set the limit to 2, and for ordered reductions it's best to disable
5082     // interleaving entirely.
5083     if (HasReductions && TheLoop->getLoopDepth() > 1) {
5084       bool HasOrderedReductions =
5085           any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5086             const RecurrenceDescriptor &RdxDesc = Reduction.second;
5087             return RdxDesc.isOrdered();
5088           });
5089       if (HasOrderedReductions) {
5090         LLVM_DEBUG(
5091             dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
5092         return 1;
5093       }
5094 
5095       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5096       SmallIC = std::min(SmallIC, F);
5097       StoresIC = std::min(StoresIC, F);
5098       LoadsIC = std::min(LoadsIC, F);
5099     }
5100 
5101     if (EnableLoadStoreRuntimeInterleave &&
5102         std::max(StoresIC, LoadsIC) > SmallIC) {
5103       LLVM_DEBUG(
5104           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5105       return std::max(StoresIC, LoadsIC);
5106     }
5107 
5108     // If there are scalar reductions and TTI has enabled aggressive
5109     // interleaving for reductions, we will interleave to expose ILP.
5110     if (VF.isScalar() && AggressivelyInterleaveReductions) {
5111       LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5112       // Interleave no less than SmallIC but not as aggressive as the normal IC
5113       // to satisfy the rare situation when resources are too limited.
5114       return std::max(IC / 2, SmallIC);
5115     }
5116 
5117     LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5118     return SmallIC;
5119   }
5120 
5121   // Interleave if this is a large loop (small loops are already dealt with by
5122   // this point) that could benefit from interleaving.
5123   if (AggressivelyInterleaveReductions) {
5124     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5125     return IC;
5126   }
5127 
5128   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5129   return 1;
5130 }
5131 
5132 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5133 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
5134   // This function calculates the register usage by measuring the highest number
5135   // of values that are alive at a single location. Obviously, this is a very
5136   // rough estimation. We scan the loop in a topological order in order and
5137   // assign a number to each instruction. We use RPO to ensure that defs are
5138   // met before their users. We assume that each instruction that has in-loop
5139   // users starts an interval. We record every time that an in-loop value is
5140   // used, so we have a list of the first and last occurrences of each
5141   // instruction. Next, we transpose this data structure into a multi map that
5142   // holds the list of intervals that *end* at a specific location. This multi
5143   // map allows us to perform a linear search. We scan the instructions linearly
5144   // and record each time that a new interval starts, by placing it in a set.
5145   // If we find this value in the multi-map then we remove it from the set.
5146   // The max register usage is the maximum size of the set.
5147   // We also search for instructions that are defined outside the loop, but are
5148   // used inside the loop. We need this number separately from the max-interval
5149   // usage number because when we unroll, loop-invariant values do not take
5150   // more register.
5151   LoopBlocksDFS DFS(TheLoop);
5152   DFS.perform(LI);
5153 
5154   RegisterUsage RU;
5155 
5156   // Each 'key' in the map opens a new interval. The values
5157   // of the map are the index of the 'last seen' usage of the
5158   // instruction that is the key.
5159   using IntervalMap = SmallDenseMap<Instruction *, unsigned, 16>;
5160 
5161   // Maps instruction to its index.
5162   SmallVector<Instruction *, 64> IdxToInstr;
5163   // Marks the end of each interval.
5164   IntervalMap EndPoint;
5165   // Saves the list of instruction indices that are used in the loop.
5166   SmallPtrSet<Instruction *, 8> Ends;
5167   // Saves the list of values that are used in the loop but are defined outside
5168   // the loop (not including non-instruction values such as arguments and
5169   // constants).
5170   SmallSetVector<Instruction *, 8> LoopInvariants;
5171 
5172   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5173     for (Instruction &I : BB->instructionsWithoutDebug()) {
5174       IdxToInstr.push_back(&I);
5175 
5176       // Save the end location of each USE.
5177       for (Value *U : I.operands()) {
5178         auto *Instr = dyn_cast<Instruction>(U);
5179 
5180         // Ignore non-instruction values such as arguments, constants, etc.
5181         // FIXME: Might need some motivation why these values are ignored. If
5182         // for example an argument is used inside the loop it will increase the
5183         // register pressure (so shouldn't we add it to LoopInvariants).
5184         if (!Instr)
5185           continue;
5186 
5187         // If this instruction is outside the loop then record it and continue.
5188         if (!TheLoop->contains(Instr)) {
5189           LoopInvariants.insert(Instr);
5190           continue;
5191         }
5192 
5193         // Overwrite previous end points.
5194         EndPoint[Instr] = IdxToInstr.size();
5195         Ends.insert(Instr);
5196       }
5197     }
5198   }
5199 
5200   // Saves the list of intervals that end with the index in 'key'.
5201   using InstrList = SmallVector<Instruction *, 2>;
5202   SmallDenseMap<unsigned, InstrList, 16> TransposeEnds;
5203 
5204   // Transpose the EndPoints to a list of values that end at each index.
5205   for (auto &Interval : EndPoint)
5206     TransposeEnds[Interval.second].push_back(Interval.first);
5207 
5208   SmallPtrSet<Instruction *, 8> OpenIntervals;
5209   SmallVector<RegisterUsage, 8> RUs(VFs.size());
5210   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
5211 
5212   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5213 
5214   const auto &TTICapture = TTI;
5215   auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
5216     if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty) ||
5217         (VF.isScalable() &&
5218          !TTICapture.isElementTypeLegalForScalableVector(Ty)))
5219       return 0;
5220     return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
5221   };
5222 
5223   for (unsigned int Idx = 0, Sz = IdxToInstr.size(); Idx < Sz; ++Idx) {
5224     Instruction *I = IdxToInstr[Idx];
5225 
5226     // Remove all of the instructions that end at this location.
5227     InstrList &List = TransposeEnds[Idx];
5228     for (Instruction *ToRemove : List)
5229       OpenIntervals.erase(ToRemove);
5230 
5231     // Ignore instructions that are never used within the loop.
5232     if (!Ends.count(I))
5233       continue;
5234 
5235     // Skip ignored values.
5236     if (ValuesToIgnore.count(I))
5237       continue;
5238 
5239     collectInLoopReductions();
5240 
5241     // For each VF find the maximum usage of registers.
5242     for (unsigned J = 0, E = VFs.size(); J < E; ++J) {
5243       // Count the number of registers used, per register class, given all open
5244       // intervals.
5245       // Note that elements in this SmallMapVector will be default constructed
5246       // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if
5247       // there is no previous entry for ClassID.
5248       SmallMapVector<unsigned, unsigned, 4> RegUsage;
5249 
5250       if (VFs[J].isScalar()) {
5251         for (auto *Inst : OpenIntervals) {
5252           unsigned ClassID =
5253               TTI.getRegisterClassForType(false, Inst->getType());
5254           // FIXME: The target might use more than one register for the type
5255           // even in the scalar case.
5256           RegUsage[ClassID] += 1;
5257         }
5258       } else {
5259         collectUniformsAndScalars(VFs[J]);
5260         for (auto *Inst : OpenIntervals) {
5261           // Skip ignored values for VF > 1.
5262           if (VecValuesToIgnore.count(Inst))
5263             continue;
5264           if (isScalarAfterVectorization(Inst, VFs[J])) {
5265             unsigned ClassID =
5266                 TTI.getRegisterClassForType(false, Inst->getType());
5267             // FIXME: The target might use more than one register for the type
5268             // even in the scalar case.
5269             RegUsage[ClassID] += 1;
5270           } else {
5271             unsigned ClassID =
5272                 TTI.getRegisterClassForType(true, Inst->getType());
5273             RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[J]);
5274           }
5275         }
5276       }
5277 
5278       for (const auto &Pair : RegUsage) {
5279         auto &Entry = MaxUsages[J][Pair.first];
5280         Entry = std::max(Entry, Pair.second);
5281       }
5282     }
5283 
5284     LLVM_DEBUG(dbgs() << "LV(REG): At #" << Idx << " Interval # "
5285                       << OpenIntervals.size() << '\n');
5286 
5287     // Add the current instruction to the list of open intervals.
5288     OpenIntervals.insert(I);
5289   }
5290 
5291   for (unsigned Idx = 0, End = VFs.size(); Idx < End; ++Idx) {
5292     // Note that elements in this SmallMapVector will be default constructed
5293     // as 0. So we can use "Invariant[ClassID] += n" in the code below even if
5294     // there is no previous entry for ClassID.
5295     SmallMapVector<unsigned, unsigned, 4> Invariant;
5296 
5297     for (auto *Inst : LoopInvariants) {
5298       // FIXME: The target might use more than one register for the type
5299       // even in the scalar case.
5300       bool IsScalar = all_of(Inst->users(), [&](User *U) {
5301         auto *I = cast<Instruction>(U);
5302         return TheLoop != LI->getLoopFor(I->getParent()) ||
5303                isScalarAfterVectorization(I, VFs[Idx]);
5304       });
5305 
5306       ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[Idx];
5307       unsigned ClassID =
5308           TTI.getRegisterClassForType(VF.isVector(), Inst->getType());
5309       Invariant[ClassID] += GetRegUsage(Inst->getType(), VF);
5310     }
5311 
5312     LLVM_DEBUG({
5313       dbgs() << "LV(REG): VF = " << VFs[Idx] << '\n';
5314       dbgs() << "LV(REG): Found max usage: " << MaxUsages[Idx].size()
5315              << " item\n";
5316       for (const auto &pair : MaxUsages[Idx]) {
5317         dbgs() << "LV(REG): RegisterClass: "
5318                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5319                << " registers\n";
5320       }
5321       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
5322              << " item\n";
5323       for (const auto &pair : Invariant) {
5324         dbgs() << "LV(REG): RegisterClass: "
5325                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5326                << " registers\n";
5327       }
5328     });
5329 
5330     RU.LoopInvariantRegs = Invariant;
5331     RU.MaxLocalUsers = MaxUsages[Idx];
5332     RUs[Idx] = RU;
5333   }
5334 
5335   return RUs;
5336 }
5337 
5338 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
5339                                                            ElementCount VF) {
5340   // TODO: Cost model for emulated masked load/store is completely
5341   // broken. This hack guides the cost model to use an artificially
5342   // high enough value to practically disable vectorization with such
5343   // operations, except where previously deployed legality hack allowed
5344   // using very low cost values. This is to avoid regressions coming simply
5345   // from moving "masked load/store" check from legality to cost model.
5346   // Masked Load/Gather emulation was previously never allowed.
5347   // Limited number of Masked Store/Scatter emulation was allowed.
5348   assert((isPredicatedInst(I)) &&
5349          "Expecting a scalar emulated instruction");
5350   return isa<LoadInst>(I) ||
5351          (isa<StoreInst>(I) &&
5352           NumPredStores > NumberOfStoresToPredicate);
5353 }
5354 
5355 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
5356   // If we aren't vectorizing the loop, or if we've already collected the
5357   // instructions to scalarize, there's nothing to do. Collection may already
5358   // have occurred if we have a user-selected VF and are now computing the
5359   // expected cost for interleaving.
5360   if (VF.isScalar() || VF.isZero() || InstsToScalarize.contains(VF))
5361     return;
5362 
5363   // Initialize a mapping for VF in InstsToScalalarize. If we find that it's
5364   // not profitable to scalarize any instructions, the presence of VF in the
5365   // map will indicate that we've analyzed it already.
5366   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5367 
5368   PredicatedBBsAfterVectorization[VF].clear();
5369 
5370   // Find all the instructions that are scalar with predication in the loop and
5371   // determine if it would be better to not if-convert the blocks they are in.
5372   // If so, we also record the instructions to scalarize.
5373   for (BasicBlock *BB : TheLoop->blocks()) {
5374     if (!blockNeedsPredicationForAnyReason(BB))
5375       continue;
5376     for (Instruction &I : *BB)
5377       if (isScalarWithPredication(&I, VF)) {
5378         ScalarCostsTy ScalarCosts;
5379         // Do not apply discount logic for:
5380         // 1. Scalars after vectorization, as there will only be a single copy
5381         // of the instruction.
5382         // 2. Scalable VF, as that would lead to invalid scalarization costs.
5383         // 3. Emulated masked memrefs, if a hacked cost is needed.
5384         if (!isScalarAfterVectorization(&I, VF) && !VF.isScalable() &&
5385             !useEmulatedMaskMemRefHack(&I, VF) &&
5386             computePredInstDiscount(&I, ScalarCosts, VF) >= 0) {
5387           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5388           // Check if we decided to scalarize a call. If so, update the widening
5389           // decision of the call to CM_Scalarize with the computed scalar cost.
5390           for (const auto &[I, _] : ScalarCosts) {
5391             auto *CI = dyn_cast<CallInst>(I);
5392             if (!CI || !CallWideningDecisions.contains({CI, VF}))
5393               continue;
5394             CallWideningDecisions[{CI, VF}].Kind = CM_Scalarize;
5395             CallWideningDecisions[{CI, VF}].Cost = ScalarCosts[CI];
5396           }
5397         }
5398         // Remember that BB will remain after vectorization.
5399         PredicatedBBsAfterVectorization[VF].insert(BB);
5400         for (auto *Pred : predecessors(BB)) {
5401           if (Pred->getSingleSuccessor() == BB)
5402             PredicatedBBsAfterVectorization[VF].insert(Pred);
5403         }
5404       }
5405   }
5406 }
5407 
5408 InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
5409     Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
5410   assert(!isUniformAfterVectorization(PredInst, VF) &&
5411          "Instruction marked uniform-after-vectorization will be predicated");
5412 
5413   // Initialize the discount to zero, meaning that the scalar version and the
5414   // vector version cost the same.
5415   InstructionCost Discount = 0;
5416 
5417   // Holds instructions to analyze. The instructions we visit are mapped in
5418   // ScalarCosts. Those instructions are the ones that would be scalarized if
5419   // we find that the scalar version costs less.
5420   SmallVector<Instruction *, 8> Worklist;
5421 
5422   // Returns true if the given instruction can be scalarized.
5423   auto CanBeScalarized = [&](Instruction *I) -> bool {
5424     // We only attempt to scalarize instructions forming a single-use chain
5425     // from the original predicated block that would otherwise be vectorized.
5426     // Although not strictly necessary, we give up on instructions we know will
5427     // already be scalar to avoid traversing chains that are unlikely to be
5428     // beneficial.
5429     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5430         isScalarAfterVectorization(I, VF))
5431       return false;
5432 
5433     // If the instruction is scalar with predication, it will be analyzed
5434     // separately. We ignore it within the context of PredInst.
5435     if (isScalarWithPredication(I, VF))
5436       return false;
5437 
5438     // If any of the instruction's operands are uniform after vectorization,
5439     // the instruction cannot be scalarized. This prevents, for example, a
5440     // masked load from being scalarized.
5441     //
5442     // We assume we will only emit a value for lane zero of an instruction
5443     // marked uniform after vectorization, rather than VF identical values.
5444     // Thus, if we scalarize an instruction that uses a uniform, we would
5445     // create uses of values corresponding to the lanes we aren't emitting code
5446     // for. This behavior can be changed by allowing getScalarValue to clone
5447     // the lane zero values for uniforms rather than asserting.
5448     for (Use &U : I->operands())
5449       if (auto *J = dyn_cast<Instruction>(U.get()))
5450         if (isUniformAfterVectorization(J, VF))
5451           return false;
5452 
5453     // Otherwise, we can scalarize the instruction.
5454     return true;
5455   };
5456 
5457   // Compute the expected cost discount from scalarizing the entire expression
5458   // feeding the predicated instruction. We currently only consider expressions
5459   // that are single-use instruction chains.
5460   Worklist.push_back(PredInst);
5461   while (!Worklist.empty()) {
5462     Instruction *I = Worklist.pop_back_val();
5463 
5464     // If we've already analyzed the instruction, there's nothing to do.
5465     if (ScalarCosts.contains(I))
5466       continue;
5467 
5468     // Compute the cost of the vector instruction. Note that this cost already
5469     // includes the scalarization overhead of the predicated instruction.
5470     InstructionCost VectorCost = getInstructionCost(I, VF);
5471 
5472     // Compute the cost of the scalarized instruction. This cost is the cost of
5473     // the instruction as if it wasn't if-converted and instead remained in the
5474     // predicated block. We will scale this cost by block probability after
5475     // computing the scalarization overhead.
5476     InstructionCost ScalarCost =
5477         VF.getFixedValue() * getInstructionCost(I, ElementCount::getFixed(1));
5478 
5479     // Compute the scalarization overhead of needed insertelement instructions
5480     // and phi nodes.
5481     if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
5482       ScalarCost += TTI.getScalarizationOverhead(
5483           cast<VectorType>(toVectorTy(I->getType(), VF)),
5484           APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ true,
5485           /*Extract*/ false, CostKind);
5486       ScalarCost +=
5487           VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind);
5488     }
5489 
5490     // Compute the scalarization overhead of needed extractelement
5491     // instructions. For each of the instruction's operands, if the operand can
5492     // be scalarized, add it to the worklist; otherwise, account for the
5493     // overhead.
5494     for (Use &U : I->operands())
5495       if (auto *J = dyn_cast<Instruction>(U.get())) {
5496         assert(VectorType::isValidElementType(J->getType()) &&
5497                "Instruction has non-scalar type");
5498         if (CanBeScalarized(J))
5499           Worklist.push_back(J);
5500         else if (needsExtract(J, VF)) {
5501           ScalarCost += TTI.getScalarizationOverhead(
5502               cast<VectorType>(toVectorTy(J->getType(), VF)),
5503               APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false,
5504               /*Extract*/ true, CostKind);
5505         }
5506       }
5507 
5508     // Scale the total scalar cost by block probability.
5509     ScalarCost /= getReciprocalPredBlockProb();
5510 
5511     // Compute the discount. A non-negative discount means the vector version
5512     // of the instruction costs more, and scalarizing would be beneficial.
5513     Discount += VectorCost - ScalarCost;
5514     ScalarCosts[I] = ScalarCost;
5515   }
5516 
5517   return Discount;
5518 }
5519 
5520 InstructionCost LoopVectorizationCostModel::expectedCost(ElementCount VF) {
5521   InstructionCost Cost;
5522 
5523   // If the vector loop gets executed exactly once with the given VF, ignore the
5524   // costs of comparison and induction instructions, as they'll get simplified
5525   // away.
5526   SmallPtrSet<Instruction *, 2> ValuesToIgnoreForVF;
5527   auto TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5528   if (VF.isFixed() && TC == VF.getFixedValue() && !foldTailByMasking())
5529     addFullyUnrolledInstructionsToIgnore(TheLoop, Legal->getInductionVars(),
5530                                          ValuesToIgnoreForVF);
5531 
5532   // For each block.
5533   for (BasicBlock *BB : TheLoop->blocks()) {
5534     InstructionCost BlockCost;
5535 
5536     // For each instruction in the old loop.
5537     for (Instruction &I : BB->instructionsWithoutDebug()) {
5538       // Skip ignored values.
5539       if (ValuesToIgnore.count(&I) || ValuesToIgnoreForVF.count(&I) ||
5540           (VF.isVector() && VecValuesToIgnore.count(&I)))
5541         continue;
5542 
5543       InstructionCost C = getInstructionCost(&I, VF);
5544 
5545       // Check if we should override the cost.
5546       if (C.isValid() && ForceTargetInstructionCost.getNumOccurrences() > 0)
5547         C = InstructionCost(ForceTargetInstructionCost);
5548 
5549       BlockCost += C;
5550       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF "
5551                         << VF << " For instruction: " << I << '\n');
5552     }
5553 
5554     // If we are vectorizing a predicated block, it will have been
5555     // if-converted. This means that the block's instructions (aside from
5556     // stores and instructions that may divide by zero) will now be
5557     // unconditionally executed. For the scalar case, we may not always execute
5558     // the predicated block, if it is an if-else block. Thus, scale the block's
5559     // cost by the probability of executing it. blockNeedsPredication from
5560     // Legal is used so as to not include all blocks in tail folded loops.
5561     if (VF.isScalar() && Legal->blockNeedsPredication(BB))
5562       BlockCost /= getReciprocalPredBlockProb();
5563 
5564     Cost += BlockCost;
5565   }
5566 
5567   return Cost;
5568 }
5569 
5570 /// Gets Address Access SCEV after verifying that the access pattern
5571 /// is loop invariant except the induction variable dependence.
5572 ///
5573 /// This SCEV can be sent to the Target in order to estimate the address
5574 /// calculation cost.
5575 static const SCEV *getAddressAccessSCEV(
5576               Value *Ptr,
5577               LoopVectorizationLegality *Legal,
5578               PredicatedScalarEvolution &PSE,
5579               const Loop *TheLoop) {
5580 
5581   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
5582   if (!Gep)
5583     return nullptr;
5584 
5585   // We are looking for a gep with all loop invariant indices except for one
5586   // which should be an induction variable.
5587   auto *SE = PSE.getSE();
5588   unsigned NumOperands = Gep->getNumOperands();
5589   for (unsigned Idx = 1; Idx < NumOperands; ++Idx) {
5590     Value *Opd = Gep->getOperand(Idx);
5591     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
5592         !Legal->isInductionVariable(Opd))
5593       return nullptr;
5594   }
5595 
5596   // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV.
5597   return PSE.getSCEV(Ptr);
5598 }
5599 
5600 InstructionCost
5601 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
5602                                                         ElementCount VF) {
5603   assert(VF.isVector() &&
5604          "Scalarization cost of instruction implies vectorization.");
5605   if (VF.isScalable())
5606     return InstructionCost::getInvalid();
5607 
5608   Type *ValTy = getLoadStoreType(I);
5609   auto *SE = PSE.getSE();
5610 
5611   unsigned AS = getLoadStoreAddressSpace(I);
5612   Value *Ptr = getLoadStorePointerOperand(I);
5613   Type *PtrTy = toVectorTy(Ptr->getType(), VF);
5614   // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
5615   //       that it is being called from this specific place.
5616 
5617   // Figure out whether the access is strided and get the stride value
5618   // if it's known in compile time
5619   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
5620 
5621   // Get the cost of the scalar memory instruction and address computation.
5622   InstructionCost Cost =
5623       VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
5624 
5625   // Don't pass *I here, since it is scalar but will actually be part of a
5626   // vectorized loop where the user of it is a vectorized instruction.
5627   const Align Alignment = getLoadStoreAlignment(I);
5628   Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(),
5629                                                       ValTy->getScalarType(),
5630                                                       Alignment, AS, CostKind);
5631 
5632   // Get the overhead of the extractelement and insertelement instructions
5633   // we might create due to scalarization.
5634   Cost += getScalarizationOverhead(I, VF);
5635 
5636   // If we have a predicated load/store, it will need extra i1 extracts and
5637   // conditional branches, but may not be executed for each vector lane. Scale
5638   // the cost by the probability of executing the predicated block.
5639   if (isPredicatedInst(I)) {
5640     Cost /= getReciprocalPredBlockProb();
5641 
5642     // Add the cost of an i1 extract and a branch
5643     auto *VecI1Ty =
5644         VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
5645     Cost += TTI.getScalarizationOverhead(
5646         VecI1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
5647         /*Insert=*/false, /*Extract=*/true, CostKind);
5648     Cost += TTI.getCFInstrCost(Instruction::Br, CostKind);
5649 
5650     if (useEmulatedMaskMemRefHack(I, VF))
5651       // Artificially setting to a high enough value to practically disable
5652       // vectorization with such operations.
5653       Cost = 3000000;
5654   }
5655 
5656   return Cost;
5657 }
5658 
5659 InstructionCost
5660 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
5661                                                     ElementCount VF) {
5662   Type *ValTy = getLoadStoreType(I);
5663   auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5664   Value *Ptr = getLoadStorePointerOperand(I);
5665   unsigned AS = getLoadStoreAddressSpace(I);
5666   int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
5667 
5668   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5669          "Stride should be 1 or -1 for consecutive memory access");
5670   const Align Alignment = getLoadStoreAlignment(I);
5671   InstructionCost Cost = 0;
5672   if (Legal->isMaskRequired(I)) {
5673     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
5674                                       CostKind);
5675   } else {
5676     TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
5677     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
5678                                 CostKind, OpInfo, I);
5679   }
5680 
5681   bool Reverse = ConsecutiveStride < 0;
5682   if (Reverse)
5683     Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, {},
5684                                CostKind, 0);
5685   return Cost;
5686 }
5687 
5688 InstructionCost
5689 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
5690                                                 ElementCount VF) {
5691   assert(Legal->isUniformMemOp(*I, VF));
5692 
5693   Type *ValTy = getLoadStoreType(I);
5694   auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5695   const Align Alignment = getLoadStoreAlignment(I);
5696   unsigned AS = getLoadStoreAddressSpace(I);
5697   if (isa<LoadInst>(I)) {
5698     return TTI.getAddressComputationCost(ValTy) +
5699            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
5700                                CostKind) +
5701            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy, {},
5702                               CostKind);
5703   }
5704   StoreInst *SI = cast<StoreInst>(I);
5705 
5706   bool IsLoopInvariantStoreValue = Legal->isInvariant(SI->getValueOperand());
5707   return TTI.getAddressComputationCost(ValTy) +
5708          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
5709                              CostKind) +
5710          (IsLoopInvariantStoreValue
5711               ? 0
5712               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
5713                                        CostKind, VF.getKnownMinValue() - 1));
5714 }
5715 
5716 InstructionCost
5717 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
5718                                                  ElementCount VF) {
5719   Type *ValTy = getLoadStoreType(I);
5720   auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5721   const Align Alignment = getLoadStoreAlignment(I);
5722   const Value *Ptr = getLoadStorePointerOperand(I);
5723 
5724   return TTI.getAddressComputationCost(VectorTy) +
5725          TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
5726                                     Legal->isMaskRequired(I), Alignment,
5727                                     CostKind, I);
5728 }
5729 
5730 InstructionCost
5731 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
5732                                                    ElementCount VF) {
5733   const auto *Group = getInterleavedAccessGroup(I);
5734   assert(Group && "Fail to get an interleaved access group.");
5735 
5736   Instruction *InsertPos = Group->getInsertPos();
5737   Type *ValTy = getLoadStoreType(InsertPos);
5738   auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5739   unsigned AS = getLoadStoreAddressSpace(InsertPos);
5740 
5741   unsigned InterleaveFactor = Group->getFactor();
5742   auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
5743 
5744   // Holds the indices of existing members in the interleaved group.
5745   SmallVector<unsigned, 4> Indices;
5746   for (unsigned IF = 0; IF < InterleaveFactor; IF++)
5747     if (Group->getMember(IF))
5748       Indices.push_back(IF);
5749 
5750   // Calculate the cost of the whole interleaved group.
5751   bool UseMaskForGaps =
5752       (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
5753       (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
5754   InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
5755       InsertPos->getOpcode(), WideVecTy, Group->getFactor(), Indices,
5756       Group->getAlign(), AS, CostKind, Legal->isMaskRequired(I),
5757       UseMaskForGaps);
5758 
5759   if (Group->isReverse()) {
5760     // TODO: Add support for reversed masked interleaved access.
5761     assert(!Legal->isMaskRequired(I) &&
5762            "Reverse masked interleaved access not supported.");
5763     Cost += Group->getNumMembers() *
5764             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, {},
5765                                CostKind, 0);
5766   }
5767   return Cost;
5768 }
5769 
5770 std::optional<InstructionCost>
5771 LoopVectorizationCostModel::getReductionPatternCost(Instruction *I,
5772                                                     ElementCount VF,
5773                                                     Type *Ty) const {
5774   using namespace llvm::PatternMatch;
5775   // Early exit for no inloop reductions
5776   if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Ty))
5777     return std::nullopt;
5778   auto *VectorTy = cast<VectorType>(Ty);
5779 
5780   // We are looking for a pattern of, and finding the minimal acceptable cost:
5781   //  reduce(mul(ext(A), ext(B))) or
5782   //  reduce(mul(A, B)) or
5783   //  reduce(ext(A)) or
5784   //  reduce(A).
5785   // The basic idea is that we walk down the tree to do that, finding the root
5786   // reduction instruction in InLoopReductionImmediateChains. From there we find
5787   // the pattern of mul/ext and test the cost of the entire pattern vs the cost
5788   // of the components. If the reduction cost is lower then we return it for the
5789   // reduction instruction and 0 for the other instructions in the pattern. If
5790   // it is not we return an invalid cost specifying the orignal cost method
5791   // should be used.
5792   Instruction *RetI = I;
5793   if (match(RetI, m_ZExtOrSExt(m_Value()))) {
5794     if (!RetI->hasOneUser())
5795       return std::nullopt;
5796     RetI = RetI->user_back();
5797   }
5798 
5799   if (match(RetI, m_OneUse(m_Mul(m_Value(), m_Value()))) &&
5800       RetI->user_back()->getOpcode() == Instruction::Add) {
5801     RetI = RetI->user_back();
5802   }
5803 
5804   // Test if the found instruction is a reduction, and if not return an invalid
5805   // cost specifying the parent to use the original cost modelling.
5806   if (!InLoopReductionImmediateChains.count(RetI))
5807     return std::nullopt;
5808 
5809   // Find the reduction this chain is a part of and calculate the basic cost of
5810   // the reduction on its own.
5811   Instruction *LastChain = InLoopReductionImmediateChains.at(RetI);
5812   Instruction *ReductionPhi = LastChain;
5813   while (!isa<PHINode>(ReductionPhi))
5814     ReductionPhi = InLoopReductionImmediateChains.at(ReductionPhi);
5815 
5816   const RecurrenceDescriptor &RdxDesc =
5817       Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second;
5818 
5819   InstructionCost BaseCost;
5820   RecurKind RK = RdxDesc.getRecurrenceKind();
5821   if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) {
5822     Intrinsic::ID MinMaxID = getMinMaxReductionIntrinsicOp(RK);
5823     BaseCost = TTI.getMinMaxReductionCost(MinMaxID, VectorTy,
5824                                           RdxDesc.getFastMathFlags(), CostKind);
5825   } else {
5826     BaseCost = TTI.getArithmeticReductionCost(
5827         RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
5828   }
5829 
5830   // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
5831   // normal fmul instruction to the cost of the fadd reduction.
5832   if (RK == RecurKind::FMulAdd)
5833     BaseCost +=
5834         TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);
5835 
5836   // If we're using ordered reductions then we can just return the base cost
5837   // here, since getArithmeticReductionCost calculates the full ordered
5838   // reduction cost when FP reassociation is not allowed.
5839   if (useOrderedReductions(RdxDesc))
5840     return BaseCost;
5841 
5842   // Get the operand that was not the reduction chain and match it to one of the
5843   // patterns, returning the better cost if it is found.
5844   Instruction *RedOp = RetI->getOperand(1) == LastChain
5845                            ? dyn_cast<Instruction>(RetI->getOperand(0))
5846                            : dyn_cast<Instruction>(RetI->getOperand(1));
5847 
5848   VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
5849 
5850   Instruction *Op0, *Op1;
5851   if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
5852       match(RedOp,
5853             m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
5854       match(Op0, m_ZExtOrSExt(m_Value())) &&
5855       Op0->getOpcode() == Op1->getOpcode() &&
5856       Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
5857       !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) &&
5858       (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
5859 
5860     // Matched reduce.add(ext(mul(ext(A), ext(B)))
5861     // Note that the extend opcodes need to all match, or if A==B they will have
5862     // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
5863     // which is equally fine.
5864     bool IsUnsigned = isa<ZExtInst>(Op0);
5865     auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
5866     auto *MulType = VectorType::get(Op0->getType(), VectorTy);
5867 
5868     InstructionCost ExtCost =
5869         TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
5870                              TTI::CastContextHint::None, CostKind, Op0);
5871     InstructionCost MulCost =
5872         TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
5873     InstructionCost Ext2Cost =
5874         TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType,
5875                              TTI::CastContextHint::None, CostKind, RedOp);
5876 
5877     InstructionCost RedCost = TTI.getMulAccReductionCost(
5878         IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
5879 
5880     if (RedCost.isValid() &&
5881         RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
5882       return I == RetI ? RedCost : 0;
5883   } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
5884              !TheLoop->isLoopInvariant(RedOp)) {
5885     // Matched reduce(ext(A))
5886     bool IsUnsigned = isa<ZExtInst>(RedOp);
5887     auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
5888     InstructionCost RedCost = TTI.getExtendedReductionCost(
5889         RdxDesc.getOpcode(), IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
5890         RdxDesc.getFastMathFlags(), CostKind);
5891 
5892     InstructionCost ExtCost =
5893         TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
5894                              TTI::CastContextHint::None, CostKind, RedOp);
5895     if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
5896       return I == RetI ? RedCost : 0;
5897   } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
5898              match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
5899     if (match(Op0, m_ZExtOrSExt(m_Value())) &&
5900         Op0->getOpcode() == Op1->getOpcode() &&
5901         !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) {
5902       bool IsUnsigned = isa<ZExtInst>(Op0);
5903       Type *Op0Ty = Op0->getOperand(0)->getType();
5904       Type *Op1Ty = Op1->getOperand(0)->getType();
5905       Type *LargestOpTy =
5906           Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
5907                                                                     : Op0Ty;
5908       auto *ExtType = VectorType::get(LargestOpTy, VectorTy);
5909 
5910       // Matched reduce.add(mul(ext(A), ext(B))), where the two ext may be of
5911       // different sizes. We take the largest type as the ext to reduce, and add
5912       // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))).
5913       InstructionCost ExtCost0 = TTI.getCastInstrCost(
5914           Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy),
5915           TTI::CastContextHint::None, CostKind, Op0);
5916       InstructionCost ExtCost1 = TTI.getCastInstrCost(
5917           Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy),
5918           TTI::CastContextHint::None, CostKind, Op1);
5919       InstructionCost MulCost =
5920           TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
5921 
5922       InstructionCost RedCost = TTI.getMulAccReductionCost(
5923           IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
5924       InstructionCost ExtraExtCost = 0;
5925       if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
5926         Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
5927         ExtraExtCost = TTI.getCastInstrCost(
5928             ExtraExtOp->getOpcode(), ExtType,
5929             VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy),
5930             TTI::CastContextHint::None, CostKind, ExtraExtOp);
5931       }
5932 
5933       if (RedCost.isValid() &&
5934           (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
5935         return I == RetI ? RedCost : 0;
5936     } else if (!match(I, m_ZExtOrSExt(m_Value()))) {
5937       // Matched reduce.add(mul())
5938       InstructionCost MulCost =
5939           TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
5940 
5941       InstructionCost RedCost = TTI.getMulAccReductionCost(
5942           true, RdxDesc.getRecurrenceType(), VectorTy, CostKind);
5943 
5944       if (RedCost.isValid() && RedCost < MulCost + BaseCost)
5945         return I == RetI ? RedCost : 0;
5946     }
5947   }
5948 
5949   return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt;
5950 }
5951 
5952 InstructionCost
5953 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
5954                                                      ElementCount VF) {
5955   // Calculate scalar cost only. Vectorization cost should be ready at this
5956   // moment.
5957   if (VF.isScalar()) {
5958     Type *ValTy = getLoadStoreType(I);
5959     const Align Alignment = getLoadStoreAlignment(I);
5960     unsigned AS = getLoadStoreAddressSpace(I);
5961 
5962     TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
5963     return TTI.getAddressComputationCost(ValTy) +
5964            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, CostKind,
5965                                OpInfo, I);
5966   }
5967   return getWideningCost(I, VF);
5968 }
5969 
5970 InstructionCost
5971 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
5972                                                      ElementCount VF) const {
5973 
5974   // There is no mechanism yet to create a scalable scalarization loop,
5975   // so this is currently Invalid.
5976   if (VF.isScalable())
5977     return InstructionCost::getInvalid();
5978 
5979   if (VF.isScalar())
5980     return 0;
5981 
5982   InstructionCost Cost = 0;
5983   Type *RetTy = toVectorTy(I->getType(), VF);
5984   if (!RetTy->isVoidTy() &&
5985       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
5986     Cost += TTI.getScalarizationOverhead(
5987         cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()),
5988         /*Insert*/ true,
5989         /*Extract*/ false, CostKind);
5990 
5991   // Some targets keep addresses scalar.
5992   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
5993     return Cost;
5994 
5995   // Some targets support efficient element stores.
5996   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
5997     return Cost;
5998 
5999   // Collect operands to consider.
6000   CallInst *CI = dyn_cast<CallInst>(I);
6001   Instruction::op_range Ops = CI ? CI->args() : I->operands();
6002 
6003   // Skip operands that do not require extraction/scalarization and do not incur
6004   // any overhead.
6005   SmallVector<Type *> Tys;
6006   for (auto *V : filterExtractingOperands(Ops, VF))
6007     Tys.push_back(maybeVectorizeType(V->getType(), VF));
6008   return Cost + TTI.getOperandsScalarizationOverhead(
6009                     filterExtractingOperands(Ops, VF), Tys, CostKind);
6010 }
6011 
6012 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
6013   if (VF.isScalar())
6014     return;
6015   NumPredStores = 0;
6016   for (BasicBlock *BB : TheLoop->blocks()) {
6017     // For each instruction in the old loop.
6018     for (Instruction &I : *BB) {
6019       Value *Ptr =  getLoadStorePointerOperand(&I);
6020       if (!Ptr)
6021         continue;
6022 
6023       // TODO: We should generate better code and update the cost model for
6024       // predicated uniform stores. Today they are treated as any other
6025       // predicated store (see added test cases in
6026       // invariant-store-vectorization.ll).
6027       if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF))
6028         NumPredStores++;
6029 
6030       if (Legal->isUniformMemOp(I, VF)) {
6031         auto IsLegalToScalarize = [&]() {
6032           if (!VF.isScalable())
6033             // Scalarization of fixed length vectors "just works".
6034             return true;
6035 
6036           // We have dedicated lowering for unpredicated uniform loads and
6037           // stores.  Note that even with tail folding we know that at least
6038           // one lane is active (i.e. generalized predication is not possible
6039           // here), and the logic below depends on this fact.
6040           if (!foldTailByMasking())
6041             return true;
6042 
6043           // For scalable vectors, a uniform memop load is always
6044           // uniform-by-parts  and we know how to scalarize that.
6045           if (isa<LoadInst>(I))
6046             return true;
6047 
6048           // A uniform store isn't neccessarily uniform-by-part
6049           // and we can't assume scalarization.
6050           auto &SI = cast<StoreInst>(I);
6051           return TheLoop->isLoopInvariant(SI.getValueOperand());
6052         };
6053 
6054         const InstructionCost GatherScatterCost =
6055           isLegalGatherOrScatter(&I, VF) ?
6056           getGatherScatterCost(&I, VF) : InstructionCost::getInvalid();
6057 
6058         // Load: Scalar load + broadcast
6059         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6060         // FIXME: This cost is a significant under-estimate for tail folded
6061         // memory ops.
6062         const InstructionCost ScalarizationCost =
6063             IsLegalToScalarize() ? getUniformMemOpCost(&I, VF)
6064                                  : InstructionCost::getInvalid();
6065 
6066         // Choose better solution for the current VF,  Note that Invalid
6067         // costs compare as maximumal large.  If both are invalid, we get
6068         // scalable invalid which signals a failure and a vectorization abort.
6069         if (GatherScatterCost < ScalarizationCost)
6070           setWideningDecision(&I, VF, CM_GatherScatter, GatherScatterCost);
6071         else
6072           setWideningDecision(&I, VF, CM_Scalarize, ScalarizationCost);
6073         continue;
6074       }
6075 
6076       // We assume that widening is the best solution when possible.
6077       if (memoryInstructionCanBeWidened(&I, VF)) {
6078         InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
6079         int ConsecutiveStride = Legal->isConsecutivePtr(
6080             getLoadStoreType(&I), getLoadStorePointerOperand(&I));
6081         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6082                "Expected consecutive stride.");
6083         InstWidening Decision =
6084             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6085         setWideningDecision(&I, VF, Decision, Cost);
6086         continue;
6087       }
6088 
6089       // Choose between Interleaving, Gather/Scatter or Scalarization.
6090       InstructionCost InterleaveCost = InstructionCost::getInvalid();
6091       unsigned NumAccesses = 1;
6092       if (isAccessInterleaved(&I)) {
6093         const auto *Group = getInterleavedAccessGroup(&I);
6094         assert(Group && "Fail to get an interleaved access group.");
6095 
6096         // Make one decision for the whole group.
6097         if (getWideningDecision(&I, VF) != CM_Unknown)
6098           continue;
6099 
6100         NumAccesses = Group->getNumMembers();
6101         if (interleavedAccessCanBeWidened(&I, VF))
6102           InterleaveCost = getInterleaveGroupCost(&I, VF);
6103       }
6104 
6105       InstructionCost GatherScatterCost =
6106           isLegalGatherOrScatter(&I, VF)
6107               ? getGatherScatterCost(&I, VF) * NumAccesses
6108               : InstructionCost::getInvalid();
6109 
6110       InstructionCost ScalarizationCost =
6111           getMemInstScalarizationCost(&I, VF) * NumAccesses;
6112 
6113       // Choose better solution for the current VF,
6114       // write down this decision and use it during vectorization.
6115       InstructionCost Cost;
6116       InstWidening Decision;
6117       if (InterleaveCost <= GatherScatterCost &&
6118           InterleaveCost < ScalarizationCost) {
6119         Decision = CM_Interleave;
6120         Cost = InterleaveCost;
6121       } else if (GatherScatterCost < ScalarizationCost) {
6122         Decision = CM_GatherScatter;
6123         Cost = GatherScatterCost;
6124       } else {
6125         Decision = CM_Scalarize;
6126         Cost = ScalarizationCost;
6127       }
6128       // If the instructions belongs to an interleave group, the whole group
6129       // receives the same decision. The whole group receives the cost, but
6130       // the cost will actually be assigned to one instruction.
6131       if (const auto *Group = getInterleavedAccessGroup(&I))
6132         setWideningDecision(Group, VF, Decision, Cost);
6133       else
6134         setWideningDecision(&I, VF, Decision, Cost);
6135     }
6136   }
6137 
6138   // Make sure that any load of address and any other address computation
6139   // remains scalar unless there is gather/scatter support. This avoids
6140   // inevitable extracts into address registers, and also has the benefit of
6141   // activating LSR more, since that pass can't optimize vectorized
6142   // addresses.
6143   if (TTI.prefersVectorizedAddressing())
6144     return;
6145 
6146   // Start with all scalar pointer uses.
6147   SmallPtrSet<Instruction *, 8> AddrDefs;
6148   for (BasicBlock *BB : TheLoop->blocks())
6149     for (Instruction &I : *BB) {
6150       Instruction *PtrDef =
6151         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6152       if (PtrDef && TheLoop->contains(PtrDef) &&
6153           getWideningDecision(&I, VF) != CM_GatherScatter)
6154         AddrDefs.insert(PtrDef);
6155     }
6156 
6157   // Add all instructions used to generate the addresses.
6158   SmallVector<Instruction *, 4> Worklist;
6159   append_range(Worklist, AddrDefs);
6160   while (!Worklist.empty()) {
6161     Instruction *I = Worklist.pop_back_val();
6162     for (auto &Op : I->operands())
6163       if (auto *InstOp = dyn_cast<Instruction>(Op))
6164         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6165             AddrDefs.insert(InstOp).second)
6166           Worklist.push_back(InstOp);
6167   }
6168 
6169   for (auto *I : AddrDefs) {
6170     if (isa<LoadInst>(I)) {
6171       // Setting the desired widening decision should ideally be handled in
6172       // by cost functions, but since this involves the task of finding out
6173       // if the loaded register is involved in an address computation, it is
6174       // instead changed here when we know this is the case.
6175       InstWidening Decision = getWideningDecision(I, VF);
6176       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6177         // Scalarize a widened load of address.
6178         setWideningDecision(
6179             I, VF, CM_Scalarize,
6180             (VF.getKnownMinValue() *
6181              getMemoryInstructionCost(I, ElementCount::getFixed(1))));
6182       else if (const auto *Group = getInterleavedAccessGroup(I)) {
6183         // Scalarize an interleave group of address loads.
6184         for (unsigned I = 0; I < Group->getFactor(); ++I) {
6185           if (Instruction *Member = Group->getMember(I))
6186             setWideningDecision(
6187                 Member, VF, CM_Scalarize,
6188                 (VF.getKnownMinValue() *
6189                  getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
6190         }
6191       }
6192     } else
6193       // Make sure I gets scalarized and a cost estimate without
6194       // scalarization overhead.
6195       ForcedScalars[VF].insert(I);
6196   }
6197 }
6198 
6199 void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
6200   assert(!VF.isScalar() &&
6201          "Trying to set a vectorization decision for a scalar VF");
6202 
6203   auto ForcedScalar = ForcedScalars.find(VF);
6204   for (BasicBlock *BB : TheLoop->blocks()) {
6205     // For each instruction in the old loop.
6206     for (Instruction &I : *BB) {
6207       CallInst *CI = dyn_cast<CallInst>(&I);
6208 
6209       if (!CI)
6210         continue;
6211 
6212       InstructionCost ScalarCost = InstructionCost::getInvalid();
6213       InstructionCost VectorCost = InstructionCost::getInvalid();
6214       InstructionCost IntrinsicCost = InstructionCost::getInvalid();
6215       Function *ScalarFunc = CI->getCalledFunction();
6216       Type *ScalarRetTy = CI->getType();
6217       SmallVector<Type *, 4> Tys, ScalarTys;
6218       for (auto &ArgOp : CI->args())
6219         ScalarTys.push_back(ArgOp->getType());
6220 
6221       // Estimate cost of scalarized vector call. The source operands are
6222       // assumed to be vectors, so we need to extract individual elements from
6223       // there, execute VF scalar calls, and then gather the result into the
6224       // vector return value.
6225       InstructionCost ScalarCallCost =
6226           TTI.getCallInstrCost(ScalarFunc, ScalarRetTy, ScalarTys, CostKind);
6227 
6228       // Compute costs of unpacking argument values for the scalar calls and
6229       // packing the return values to a vector.
6230       InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);
6231 
6232       ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
6233       // Honor ForcedScalars and UniformAfterVectorization decisions.
6234       // TODO: For calls, it might still be more profitable to widen. Use
6235       // VPlan-based cost model to compare different options.
6236       if (VF.isVector() && ((ForcedScalar != ForcedScalars.end() &&
6237                              ForcedScalar->second.contains(CI)) ||
6238                             isUniformAfterVectorization(CI, VF))) {
6239         setCallWideningDecision(CI, VF, CM_Scalarize, nullptr,
6240                                 Intrinsic::not_intrinsic, std::nullopt,
6241                                 ScalarCost);
6242         continue;
6243       }
6244 
6245       bool MaskRequired = Legal->isMaskRequired(CI);
6246       // Compute corresponding vector type for return value and arguments.
6247       Type *RetTy = toVectorTy(ScalarRetTy, VF);
6248       for (Type *ScalarTy : ScalarTys)
6249         Tys.push_back(toVectorTy(ScalarTy, VF));
6250 
6251       // An in-loop reduction using an fmuladd intrinsic is a special case;
6252       // we don't want the normal cost for that intrinsic.
6253       if (RecurrenceDescriptor::isFMulAddIntrinsic(CI))
6254         if (auto RedCost = getReductionPatternCost(CI, VF, RetTy)) {
6255           setCallWideningDecision(CI, VF, CM_IntrinsicCall, nullptr,
6256                                   getVectorIntrinsicIDForCall(CI, TLI),
6257                                   std::nullopt, *RedCost);
6258           continue;
6259         }
6260 
6261       // Find the cost of vectorizing the call, if we can find a suitable
6262       // vector variant of the function.
6263       bool UsesMask = false;
6264       VFInfo FuncInfo;
6265       Function *VecFunc = nullptr;
6266       // Search through any available variants for one we can use at this VF.
6267       for (VFInfo &Info : VFDatabase::getMappings(*CI)) {
6268         // Must match requested VF.
6269         if (Info.Shape.VF != VF)
6270           continue;
6271 
6272         // Must take a mask argument if one is required
6273         if (MaskRequired && !Info.isMasked())
6274           continue;
6275 
6276         // Check that all parameter kinds are supported
6277         bool ParamsOk = true;
6278         for (VFParameter Param : Info.Shape.Parameters) {
6279           switch (Param.ParamKind) {
6280           case VFParamKind::Vector:
6281             break;
6282           case VFParamKind::OMP_Uniform: {
6283             Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
6284             // Make sure the scalar parameter in the loop is invariant.
6285             if (!PSE.getSE()->isLoopInvariant(PSE.getSCEV(ScalarParam),
6286                                               TheLoop))
6287               ParamsOk = false;
6288             break;
6289           }
6290           case VFParamKind::OMP_Linear: {
6291             Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
6292             // Find the stride for the scalar parameter in this loop and see if
6293             // it matches the stride for the variant.
6294             // TODO: do we need to figure out the cost of an extract to get the
6295             // first lane? Or do we hope that it will be folded away?
6296             ScalarEvolution *SE = PSE.getSE();
6297             const auto *SAR =
6298                 dyn_cast<SCEVAddRecExpr>(SE->getSCEV(ScalarParam));
6299 
6300             if (!SAR || SAR->getLoop() != TheLoop) {
6301               ParamsOk = false;
6302               break;
6303             }
6304 
6305             const SCEVConstant *Step =
6306                 dyn_cast<SCEVConstant>(SAR->getStepRecurrence(*SE));
6307 
6308             if (!Step ||
6309                 Step->getAPInt().getSExtValue() != Param.LinearStepOrPos)
6310               ParamsOk = false;
6311 
6312             break;
6313           }
6314           case VFParamKind::GlobalPredicate:
6315             UsesMask = true;
6316             break;
6317           default:
6318             ParamsOk = false;
6319             break;
6320           }
6321         }
6322 
6323         if (!ParamsOk)
6324           continue;
6325 
6326         // Found a suitable candidate, stop here.
6327         VecFunc = CI->getModule()->getFunction(Info.VectorName);
6328         FuncInfo = Info;
6329         break;
6330       }
6331 
6332       // Add in the cost of synthesizing a mask if one wasn't required.
6333       InstructionCost MaskCost = 0;
6334       if (VecFunc && UsesMask && !MaskRequired)
6335         MaskCost = TTI.getShuffleCost(
6336             TargetTransformInfo::SK_Broadcast,
6337             VectorType::get(IntegerType::getInt1Ty(
6338                                 VecFunc->getFunctionType()->getContext()),
6339                             VF),
6340             {}, CostKind);
6341 
6342       if (TLI && VecFunc && !CI->isNoBuiltin())
6343         VectorCost =
6344             TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind) + MaskCost;
6345 
6346       // Find the cost of an intrinsic; some targets may have instructions that
6347       // perform the operation without needing an actual call.
6348       Intrinsic::ID IID = getVectorIntrinsicIDForCall(CI, TLI);
6349       if (IID != Intrinsic::not_intrinsic)
6350         IntrinsicCost = getVectorIntrinsicCost(CI, VF);
6351 
6352       InstructionCost Cost = ScalarCost;
6353       InstWidening Decision = CM_Scalarize;
6354 
6355       if (VectorCost <= Cost) {
6356         Cost = VectorCost;
6357         Decision = CM_VectorCall;
6358       }
6359 
6360       if (IntrinsicCost <= Cost) {
6361         Cost = IntrinsicCost;
6362         Decision = CM_IntrinsicCall;
6363       }
6364 
6365       setCallWideningDecision(CI, VF, Decision, VecFunc, IID,
6366                               FuncInfo.getParamIndexForOptionalMask(), Cost);
6367     }
6368   }
6369 }
6370 
6371 bool LoopVectorizationCostModel::shouldConsiderInvariant(Value *Op) {
6372   if (!Legal->isInvariant(Op))
6373     return false;
6374   // Consider Op invariant, if it or its operands aren't predicated
6375   // instruction in the loop. In that case, it is not trivially hoistable.
6376   auto *OpI = dyn_cast<Instruction>(Op);
6377   return !OpI || !TheLoop->contains(OpI) ||
6378          (!isPredicatedInst(OpI) &&
6379           (!isa<PHINode>(OpI) || OpI->getParent() != TheLoop->getHeader()) &&
6380           all_of(OpI->operands(),
6381                  [this](Value *Op) { return shouldConsiderInvariant(Op); }));
6382 }
6383 
6384 InstructionCost
6385 LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6386                                                ElementCount VF) {
6387   // If we know that this instruction will remain uniform, check the cost of
6388   // the scalar version.
6389   if (isUniformAfterVectorization(I, VF))
6390     VF = ElementCount::getFixed(1);
6391 
6392   if (VF.isVector() && isProfitableToScalarize(I, VF))
6393     return InstsToScalarize[VF][I];
6394 
6395   // Forced scalars do not have any scalarization overhead.
6396   auto ForcedScalar = ForcedScalars.find(VF);
6397   if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
6398     auto InstSet = ForcedScalar->second;
6399     if (InstSet.count(I))
6400       return getInstructionCost(I, ElementCount::getFixed(1)) *
6401              VF.getKnownMinValue();
6402   }
6403 
6404   Type *RetTy = I->getType();
6405   if (canTruncateToMinimalBitwidth(I, VF))
6406     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6407   auto *SE = PSE.getSE();
6408 
6409   auto HasSingleCopyAfterVectorization = [this](Instruction *I,
6410                                                 ElementCount VF) -> bool {
6411     if (VF.isScalar())
6412       return true;
6413 
6414     auto Scalarized = InstsToScalarize.find(VF);
6415     assert(Scalarized != InstsToScalarize.end() &&
6416            "VF not yet analyzed for scalarization profitability");
6417     return !Scalarized->second.count(I) &&
6418            llvm::all_of(I->users(), [&](User *U) {
6419              auto *UI = cast<Instruction>(U);
6420              return !Scalarized->second.count(UI);
6421            });
6422   };
6423   (void)HasSingleCopyAfterVectorization;
6424 
6425   Type *VectorTy;
6426   if (isScalarAfterVectorization(I, VF)) {
6427     // With the exception of GEPs and PHIs, after scalarization there should
6428     // only be one copy of the instruction generated in the loop. This is
6429     // because the VF is either 1, or any instructions that need scalarizing
6430     // have already been dealt with by the time we get here. As a result,
6431     // it means we don't have to multiply the instruction cost by VF.
6432     assert(I->getOpcode() == Instruction::GetElementPtr ||
6433            I->getOpcode() == Instruction::PHI ||
6434            (I->getOpcode() == Instruction::BitCast &&
6435             I->getType()->isPointerTy()) ||
6436            HasSingleCopyAfterVectorization(I, VF));
6437     VectorTy = RetTy;
6438   } else
6439     VectorTy = toVectorTy(RetTy, VF);
6440 
6441   if (VF.isVector() && VectorTy->isVectorTy() &&
6442       !TTI.getNumberOfParts(VectorTy))
6443     return InstructionCost::getInvalid();
6444 
6445   // TODO: We need to estimate the cost of intrinsic calls.
6446   switch (I->getOpcode()) {
6447   case Instruction::GetElementPtr:
6448     // We mark this instruction as zero-cost because the cost of GEPs in
6449     // vectorized code depends on whether the corresponding memory instruction
6450     // is scalarized or not. Therefore, we handle GEPs with the memory
6451     // instruction cost.
6452     return 0;
6453   case Instruction::Br: {
6454     // In cases of scalarized and predicated instructions, there will be VF
6455     // predicated blocks in the vectorized loop. Each branch around these
6456     // blocks requires also an extract of its vector compare i1 element.
6457     // Note that the conditional branch from the loop latch will be replaced by
6458     // a single branch controlling the loop, so there is no extra overhead from
6459     // scalarization.
6460     bool ScalarPredicatedBB = false;
6461     BranchInst *BI = cast<BranchInst>(I);
6462     if (VF.isVector() && BI->isConditional() &&
6463         (PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) ||
6464          PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))) &&
6465         BI->getParent() != TheLoop->getLoopLatch())
6466       ScalarPredicatedBB = true;
6467 
6468     if (ScalarPredicatedBB) {
6469       // Not possible to scalarize scalable vector with predicated instructions.
6470       if (VF.isScalable())
6471         return InstructionCost::getInvalid();
6472       // Return cost for branches around scalarized and predicated blocks.
6473       auto *VecI1Ty =
6474           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6475       return (
6476           TTI.getScalarizationOverhead(
6477               VecI1Ty, APInt::getAllOnes(VF.getFixedValue()),
6478               /*Insert*/ false, /*Extract*/ true, CostKind) +
6479           (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
6480     }
6481 
6482     if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
6483       // The back-edge branch will remain, as will all scalar branches.
6484       return TTI.getCFInstrCost(Instruction::Br, CostKind);
6485 
6486     // This branch will be eliminated by if-conversion.
6487     return 0;
6488     // Note: We currently assume zero cost for an unconditional branch inside
6489     // a predicated block since it will become a fall-through, although we
6490     // may decide in the future to call TTI for all branches.
6491   }
6492   case Instruction::Switch: {
6493     if (VF.isScalar())
6494       return TTI.getCFInstrCost(Instruction::Switch, CostKind);
6495     auto *Switch = cast<SwitchInst>(I);
6496     return Switch->getNumCases() *
6497            TTI.getCmpSelInstrCost(
6498                Instruction::ICmp,
6499                toVectorTy(Switch->getCondition()->getType(), VF),
6500                toVectorTy(Type::getInt1Ty(I->getContext()), VF),
6501                CmpInst::ICMP_EQ, CostKind);
6502   }
6503   case Instruction::PHI: {
6504     auto *Phi = cast<PHINode>(I);
6505 
6506     // First-order recurrences are replaced by vector shuffles inside the loop.
6507     if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) {
6508       // For <vscale x 1 x i64>, if vscale = 1 we are unable to extract the
6509       // penultimate value of the recurrence.
6510       // TODO: Consider vscale_range info.
6511       if (VF.isScalable() && VF.getKnownMinValue() == 1)
6512         return InstructionCost::getInvalid();
6513       SmallVector<int> Mask(VF.getKnownMinValue());
6514       std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
6515       return TTI.getShuffleCost(TargetTransformInfo::SK_Splice,
6516                                 cast<VectorType>(VectorTy), Mask, CostKind,
6517                                 VF.getKnownMinValue() - 1);
6518     }
6519 
6520     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6521     // converted into select instructions. We require N - 1 selects per phi
6522     // node, where N is the number of incoming values.
6523     if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) {
6524       Type *ResultTy = Phi->getType();
6525 
6526       // All instructions in an Any-of reduction chain are narrowed to bool.
6527       // Check if that is the case for this phi node.
6528       auto *HeaderUser = cast_if_present<PHINode>(
6529           find_singleton<User>(Phi->users(), [this](User *U, bool) -> User * {
6530             auto *Phi = dyn_cast<PHINode>(U);
6531             if (Phi && Phi->getParent() == TheLoop->getHeader())
6532               return Phi;
6533             return nullptr;
6534           }));
6535       if (HeaderUser) {
6536         auto &ReductionVars = Legal->getReductionVars();
6537         auto Iter = ReductionVars.find(HeaderUser);
6538         if (Iter != ReductionVars.end() &&
6539             RecurrenceDescriptor::isAnyOfRecurrenceKind(
6540                 Iter->second.getRecurrenceKind()))
6541           ResultTy = Type::getInt1Ty(Phi->getContext());
6542       }
6543       return (Phi->getNumIncomingValues() - 1) *
6544              TTI.getCmpSelInstrCost(
6545                  Instruction::Select, toVectorTy(ResultTy, VF),
6546                  toVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
6547                  CmpInst::BAD_ICMP_PREDICATE, CostKind);
6548     }
6549 
6550     // When tail folding with EVL, if the phi is part of an out of loop
6551     // reduction then it will be transformed into a wide vp_merge.
6552     if (VF.isVector() && foldTailWithEVL() &&
6553         Legal->getReductionVars().contains(Phi) && !isInLoopReduction(Phi)) {
6554       IntrinsicCostAttributes ICA(
6555           Intrinsic::vp_merge, toVectorTy(Phi->getType(), VF),
6556           {toVectorTy(Type::getInt1Ty(Phi->getContext()), VF)});
6557       return TTI.getIntrinsicInstrCost(ICA, CostKind);
6558     }
6559 
6560     return TTI.getCFInstrCost(Instruction::PHI, CostKind);
6561   }
6562   case Instruction::UDiv:
6563   case Instruction::SDiv:
6564   case Instruction::URem:
6565   case Instruction::SRem:
6566     if (VF.isVector() && isPredicatedInst(I)) {
6567       const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
6568       return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ?
6569         ScalarCost : SafeDivisorCost;
6570     }
6571     // We've proven all lanes safe to speculate, fall through.
6572     [[fallthrough]];
6573   case Instruction::Add:
6574   case Instruction::Sub: {
6575     auto Info = Legal->getHistogramInfo(I);
6576     if (Info && VF.isVector()) {
6577       const HistogramInfo *HGram = Info.value();
6578       // Assume that a non-constant update value (or a constant != 1) requires
6579       // a multiply, and add that into the cost.
6580       InstructionCost MulCost = TTI::TCC_Free;
6581       ConstantInt *RHS = dyn_cast<ConstantInt>(I->getOperand(1));
6582       if (!RHS || RHS->getZExtValue() != 1)
6583         MulCost =
6584             TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6585 
6586       // Find the cost of the histogram operation itself.
6587       Type *PtrTy = VectorType::get(HGram->Load->getPointerOperandType(), VF);
6588       Type *ScalarTy = I->getType();
6589       Type *MaskTy = VectorType::get(Type::getInt1Ty(I->getContext()), VF);
6590       IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add,
6591                                   Type::getVoidTy(I->getContext()),
6592                                   {PtrTy, ScalarTy, MaskTy});
6593 
6594       // Add the costs together with the add/sub operation.
6595       return TTI.getIntrinsicInstrCost(ICA, CostKind) + MulCost +
6596              TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, CostKind);
6597     }
6598     [[fallthrough]];
6599   }
6600   case Instruction::FAdd:
6601   case Instruction::FSub:
6602   case Instruction::Mul:
6603   case Instruction::FMul:
6604   case Instruction::FDiv:
6605   case Instruction::FRem:
6606   case Instruction::Shl:
6607   case Instruction::LShr:
6608   case Instruction::AShr:
6609   case Instruction::And:
6610   case Instruction::Or:
6611   case Instruction::Xor: {
6612     // If we're speculating on the stride being 1, the multiplication may
6613     // fold away.  We can generalize this for all operations using the notion
6614     // of neutral elements.  (TODO)
6615     if (I->getOpcode() == Instruction::Mul &&
6616         (PSE.getSCEV(I->getOperand(0))->isOne() ||
6617          PSE.getSCEV(I->getOperand(1))->isOne()))
6618       return 0;
6619 
6620     // Detect reduction patterns
6621     if (auto RedCost = getReductionPatternCost(I, VF, VectorTy))
6622       return *RedCost;
6623 
6624     // Certain instructions can be cheaper to vectorize if they have a constant
6625     // second vector operand. One example of this are shifts on x86.
6626     Value *Op2 = I->getOperand(1);
6627     if (!isa<Constant>(Op2) && PSE.getSE()->isSCEVable(Op2->getType()) &&
6628         isa<SCEVConstant>(PSE.getSCEV(Op2))) {
6629       Op2 = cast<SCEVConstant>(PSE.getSCEV(Op2))->getValue();
6630     }
6631     auto Op2Info = TTI.getOperandInfo(Op2);
6632     if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
6633         shouldConsiderInvariant(Op2))
6634       Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
6635 
6636     SmallVector<const Value *, 4> Operands(I->operand_values());
6637     return TTI.getArithmeticInstrCost(
6638         I->getOpcode(), VectorTy, CostKind,
6639         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6640         Op2Info, Operands, I, TLI);
6641   }
6642   case Instruction::FNeg: {
6643     return TTI.getArithmeticInstrCost(
6644         I->getOpcode(), VectorTy, CostKind,
6645         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6646         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6647         I->getOperand(0), I);
6648   }
6649   case Instruction::Select: {
6650     SelectInst *SI = cast<SelectInst>(I);
6651     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6652     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6653 
6654     const Value *Op0, *Op1;
6655     using namespace llvm::PatternMatch;
6656     if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
6657                         match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
6658       // select x, y, false --> x & y
6659       // select x, true, y --> x | y
6660       const auto [Op1VK, Op1VP] = TTI::getOperandInfo(Op0);
6661       const auto [Op2VK, Op2VP] = TTI::getOperandInfo(Op1);
6662       assert(Op0->getType()->getScalarSizeInBits() == 1 &&
6663               Op1->getType()->getScalarSizeInBits() == 1);
6664 
6665       SmallVector<const Value *, 2> Operands{Op0, Op1};
6666       return TTI.getArithmeticInstrCost(
6667           match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy,
6668           CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, I);
6669     }
6670 
6671     Type *CondTy = SI->getCondition()->getType();
6672     if (!ScalarCond)
6673       CondTy = VectorType::get(CondTy, VF);
6674 
6675     CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
6676     if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
6677       Pred = Cmp->getPredicate();
6678     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred,
6679                                   CostKind, {TTI::OK_AnyValue, TTI::OP_None},
6680                                   {TTI::OK_AnyValue, TTI::OP_None}, I);
6681   }
6682   case Instruction::ICmp:
6683   case Instruction::FCmp: {
6684     Type *ValTy = I->getOperand(0)->getType();
6685 
6686     if (canTruncateToMinimalBitwidth(I, VF)) {
6687       Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6688       (void)Op0AsInstruction;
6689       assert((!canTruncateToMinimalBitwidth(Op0AsInstruction, VF) ||
6690               MinBWs[I] == MinBWs[Op0AsInstruction]) &&
6691              "if both the operand and the compare are marked for "
6692              "truncation, they must have the same bitwidth");
6693       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[I]);
6694     }
6695 
6696     VectorTy = toVectorTy(ValTy, VF);
6697     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
6698                                   cast<CmpInst>(I)->getPredicate(), CostKind,
6699                                   {TTI::OK_AnyValue, TTI::OP_None},
6700                                   {TTI::OK_AnyValue, TTI::OP_None}, I);
6701   }
6702   case Instruction::Store:
6703   case Instruction::Load: {
6704     ElementCount Width = VF;
6705     if (Width.isVector()) {
6706       InstWidening Decision = getWideningDecision(I, Width);
6707       assert(Decision != CM_Unknown &&
6708              "CM decision should be taken at this point");
6709       if (getWideningCost(I, VF) == InstructionCost::getInvalid())
6710         return InstructionCost::getInvalid();
6711       if (Decision == CM_Scalarize)
6712         Width = ElementCount::getFixed(1);
6713     }
6714     VectorTy = toVectorTy(getLoadStoreType(I), Width);
6715     return getMemoryInstructionCost(I, VF);
6716   }
6717   case Instruction::BitCast:
6718     if (I->getType()->isPointerTy())
6719       return 0;
6720     [[fallthrough]];
6721   case Instruction::ZExt:
6722   case Instruction::SExt:
6723   case Instruction::FPToUI:
6724   case Instruction::FPToSI:
6725   case Instruction::FPExt:
6726   case Instruction::PtrToInt:
6727   case Instruction::IntToPtr:
6728   case Instruction::SIToFP:
6729   case Instruction::UIToFP:
6730   case Instruction::Trunc:
6731   case Instruction::FPTrunc: {
6732     // Computes the CastContextHint from a Load/Store instruction.
6733     auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
6734       assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
6735              "Expected a load or a store!");
6736 
6737       if (VF.isScalar() || !TheLoop->contains(I))
6738         return TTI::CastContextHint::Normal;
6739 
6740       switch (getWideningDecision(I, VF)) {
6741       case LoopVectorizationCostModel::CM_GatherScatter:
6742         return TTI::CastContextHint::GatherScatter;
6743       case LoopVectorizationCostModel::CM_Interleave:
6744         return TTI::CastContextHint::Interleave;
6745       case LoopVectorizationCostModel::CM_Scalarize:
6746       case LoopVectorizationCostModel::CM_Widen:
6747         return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
6748                                         : TTI::CastContextHint::Normal;
6749       case LoopVectorizationCostModel::CM_Widen_Reverse:
6750         return TTI::CastContextHint::Reversed;
6751       case LoopVectorizationCostModel::CM_Unknown:
6752         llvm_unreachable("Instr did not go through cost modelling?");
6753       case LoopVectorizationCostModel::CM_VectorCall:
6754       case LoopVectorizationCostModel::CM_IntrinsicCall:
6755         llvm_unreachable_internal("Instr has invalid widening decision");
6756       }
6757 
6758       llvm_unreachable("Unhandled case!");
6759     };
6760 
6761     unsigned Opcode = I->getOpcode();
6762     TTI::CastContextHint CCH = TTI::CastContextHint::None;
6763     // For Trunc, the context is the only user, which must be a StoreInst.
6764     if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
6765       if (I->hasOneUse())
6766         if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
6767           CCH = ComputeCCH(Store);
6768     }
6769     // For Z/Sext, the context is the operand, which must be a LoadInst.
6770     else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
6771              Opcode == Instruction::FPExt) {
6772       if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
6773         CCH = ComputeCCH(Load);
6774     }
6775 
6776     // We optimize the truncation of induction variables having constant
6777     // integer steps. The cost of these truncations is the same as the scalar
6778     // operation.
6779     if (isOptimizableIVTruncate(I, VF)) {
6780       auto *Trunc = cast<TruncInst>(I);
6781       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
6782                                   Trunc->getSrcTy(), CCH, CostKind, Trunc);
6783     }
6784 
6785     // Detect reduction patterns
6786     if (auto RedCost = getReductionPatternCost(I, VF, VectorTy))
6787       return *RedCost;
6788 
6789     Type *SrcScalarTy = I->getOperand(0)->getType();
6790     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6791     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6792       SrcScalarTy =
6793           IntegerType::get(SrcScalarTy->getContext(), MinBWs[Op0AsInstruction]);
6794     Type *SrcVecTy =
6795         VectorTy->isVectorTy() ? toVectorTy(SrcScalarTy, VF) : SrcScalarTy;
6796 
6797     if (canTruncateToMinimalBitwidth(I, VF)) {
6798       // If the result type is <= the source type, there will be no extend
6799       // after truncating the users to the minimal required bitwidth.
6800       if (VectorTy->getScalarSizeInBits() <= SrcVecTy->getScalarSizeInBits() &&
6801           (I->getOpcode() == Instruction::ZExt ||
6802            I->getOpcode() == Instruction::SExt))
6803         return 0;
6804     }
6805 
6806     return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
6807   }
6808   case Instruction::Call:
6809     return getVectorCallCost(cast<CallInst>(I), VF);
6810   case Instruction::ExtractValue:
6811     return TTI.getInstructionCost(I, CostKind);
6812   case Instruction::Alloca:
6813     // We cannot easily widen alloca to a scalable alloca, as
6814     // the result would need to be a vector of pointers.
6815     if (VF.isScalable())
6816       return InstructionCost::getInvalid();
6817     [[fallthrough]];
6818   default:
6819     // This opcode is unknown. Assume that it is the same as 'mul'.
6820     return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6821   } // end of switch.
6822 }
6823 
6824 void LoopVectorizationCostModel::collectValuesToIgnore() {
6825   // Ignore ephemeral values.
6826   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
6827 
6828   SmallVector<Value *, 4> DeadInterleavePointerOps;
6829   SmallVector<Value *, 4> DeadOps;
6830 
6831   // If a scalar epilogue is required, users outside the loop won't use
6832   // live-outs from the vector loop but from the scalar epilogue. Ignore them if
6833   // that is the case.
6834   bool RequiresScalarEpilogue = requiresScalarEpilogue(true);
6835   auto IsLiveOutDead = [this, RequiresScalarEpilogue](User *U) {
6836     return RequiresScalarEpilogue &&
6837            !TheLoop->contains(cast<Instruction>(U)->getParent());
6838   };
6839 
6840   LoopBlocksDFS DFS(TheLoop);
6841   DFS.perform(LI);
6842   MapVector<Value *, SmallVector<Value *>> DeadInvariantStoreOps;
6843   for (BasicBlock *BB : reverse(make_range(DFS.beginRPO(), DFS.endRPO())))
6844     for (Instruction &I : reverse(*BB)) {
6845       // Find all stores to invariant variables. Since they are going to sink
6846       // outside the loop we do not need calculate cost for them.
6847       StoreInst *SI;
6848       if ((SI = dyn_cast<StoreInst>(&I)) &&
6849           Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) {
6850         ValuesToIgnore.insert(&I);
6851         DeadInvariantStoreOps[SI->getPointerOperand()].push_back(
6852             SI->getValueOperand());
6853       }
6854 
6855       if (VecValuesToIgnore.contains(&I) || ValuesToIgnore.contains(&I))
6856         continue;
6857 
6858       // Add instructions that would be trivially dead and are only used by
6859       // values already ignored to DeadOps to seed worklist.
6860       if (wouldInstructionBeTriviallyDead(&I, TLI) &&
6861           all_of(I.users(), [this, IsLiveOutDead](User *U) {
6862             return VecValuesToIgnore.contains(U) ||
6863                    ValuesToIgnore.contains(U) || IsLiveOutDead(U);
6864           }))
6865         DeadOps.push_back(&I);
6866 
6867       // For interleave groups, we only create a pointer for the start of the
6868       // interleave group. Queue up addresses of group members except the insert
6869       // position for further processing.
6870       if (isAccessInterleaved(&I)) {
6871         auto *Group = getInterleavedAccessGroup(&I);
6872         if (Group->getInsertPos() == &I)
6873           continue;
6874         Value *PointerOp = getLoadStorePointerOperand(&I);
6875         DeadInterleavePointerOps.push_back(PointerOp);
6876       }
6877 
6878       // Queue branches for analysis. They are dead, if their successors only
6879       // contain dead instructions.
6880       if (auto *Br = dyn_cast<BranchInst>(&I)) {
6881         if (Br->isConditional())
6882           DeadOps.push_back(&I);
6883       }
6884     }
6885 
6886   // Mark ops feeding interleave group members as free, if they are only used
6887   // by other dead computations.
6888   for (unsigned I = 0; I != DeadInterleavePointerOps.size(); ++I) {
6889     auto *Op = dyn_cast<Instruction>(DeadInterleavePointerOps[I]);
6890     if (!Op || !TheLoop->contains(Op) || any_of(Op->users(), [this](User *U) {
6891           Instruction *UI = cast<Instruction>(U);
6892           return !VecValuesToIgnore.contains(U) &&
6893                  (!isAccessInterleaved(UI) ||
6894                   getInterleavedAccessGroup(UI)->getInsertPos() == UI);
6895         }))
6896       continue;
6897     VecValuesToIgnore.insert(Op);
6898     DeadInterleavePointerOps.append(Op->op_begin(), Op->op_end());
6899   }
6900 
6901   for (const auto &[_, Ops] : DeadInvariantStoreOps) {
6902     for (Value *Op : ArrayRef(Ops).drop_back())
6903       DeadOps.push_back(Op);
6904   }
6905   // Mark ops that would be trivially dead and are only used by ignored
6906   // instructions as free.
6907   BasicBlock *Header = TheLoop->getHeader();
6908 
6909   // Returns true if the block contains only dead instructions. Such blocks will
6910   // be removed by VPlan-to-VPlan transforms and won't be considered by the
6911   // VPlan-based cost model, so skip them in the legacy cost-model as well.
6912   auto IsEmptyBlock = [this](BasicBlock *BB) {
6913     return all_of(*BB, [this](Instruction &I) {
6914       return ValuesToIgnore.contains(&I) || VecValuesToIgnore.contains(&I) ||
6915              (isa<BranchInst>(&I) && !cast<BranchInst>(&I)->isConditional());
6916     });
6917   };
6918   for (unsigned I = 0; I != DeadOps.size(); ++I) {
6919     auto *Op = dyn_cast<Instruction>(DeadOps[I]);
6920 
6921     // Check if the branch should be considered dead.
6922     if (auto *Br = dyn_cast_or_null<BranchInst>(Op)) {
6923       BasicBlock *ThenBB = Br->getSuccessor(0);
6924       BasicBlock *ElseBB = Br->getSuccessor(1);
6925       // Don't considers branches leaving the loop for simplification.
6926       if (!TheLoop->contains(ThenBB) || !TheLoop->contains(ElseBB))
6927         continue;
6928       bool ThenEmpty = IsEmptyBlock(ThenBB);
6929       bool ElseEmpty = IsEmptyBlock(ElseBB);
6930       if ((ThenEmpty && ElseEmpty) ||
6931           (ThenEmpty && ThenBB->getSingleSuccessor() == ElseBB &&
6932            ElseBB->phis().empty()) ||
6933           (ElseEmpty && ElseBB->getSingleSuccessor() == ThenBB &&
6934            ThenBB->phis().empty())) {
6935         VecValuesToIgnore.insert(Br);
6936         DeadOps.push_back(Br->getCondition());
6937       }
6938       continue;
6939     }
6940 
6941     // Skip any op that shouldn't be considered dead.
6942     if (!Op || !TheLoop->contains(Op) ||
6943         (isa<PHINode>(Op) && Op->getParent() == Header) ||
6944         !wouldInstructionBeTriviallyDead(Op, TLI) ||
6945         any_of(Op->users(), [this, IsLiveOutDead](User *U) {
6946           return !VecValuesToIgnore.contains(U) &&
6947                  !ValuesToIgnore.contains(U) && !IsLiveOutDead(U);
6948         }))
6949       continue;
6950 
6951     if (!TheLoop->contains(Op->getParent()))
6952       continue;
6953 
6954     // If all of Op's users are in ValuesToIgnore, add it to ValuesToIgnore
6955     // which applies for both scalar and vector versions. Otherwise it is only
6956     // dead in vector versions, so only add it to VecValuesToIgnore.
6957     if (all_of(Op->users(),
6958                [this](User *U) { return ValuesToIgnore.contains(U); }))
6959       ValuesToIgnore.insert(Op);
6960 
6961     VecValuesToIgnore.insert(Op);
6962     DeadOps.append(Op->op_begin(), Op->op_end());
6963   }
6964 
6965   // Ignore type-promoting instructions we identified during reduction
6966   // detection.
6967   for (const auto &Reduction : Legal->getReductionVars()) {
6968     const RecurrenceDescriptor &RedDes = Reduction.second;
6969     const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
6970     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6971   }
6972   // Ignore type-casting instructions we identified during induction
6973   // detection.
6974   for (const auto &Induction : Legal->getInductionVars()) {
6975     const InductionDescriptor &IndDes = Induction.second;
6976     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6977     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6978   }
6979 }
6980 
6981 void LoopVectorizationCostModel::collectInLoopReductions() {
6982   for (const auto &Reduction : Legal->getReductionVars()) {
6983     PHINode *Phi = Reduction.first;
6984     const RecurrenceDescriptor &RdxDesc = Reduction.second;
6985 
6986     // We don't collect reductions that are type promoted (yet).
6987     if (RdxDesc.getRecurrenceType() != Phi->getType())
6988       continue;
6989 
6990     // If the target would prefer this reduction to happen "in-loop", then we
6991     // want to record it as such.
6992     unsigned Opcode = RdxDesc.getOpcode();
6993     if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
6994         !TTI.preferInLoopReduction(Opcode, Phi->getType(),
6995                                    TargetTransformInfo::ReductionFlags()))
6996       continue;
6997 
6998     // Check that we can correctly put the reductions into the loop, by
6999     // finding the chain of operations that leads from the phi to the loop
7000     // exit value.
7001     SmallVector<Instruction *, 4> ReductionOperations =
7002         RdxDesc.getReductionOpChain(Phi, TheLoop);
7003     bool InLoop = !ReductionOperations.empty();
7004 
7005     if (InLoop) {
7006       InLoopReductions.insert(Phi);
7007       // Add the elements to InLoopReductionImmediateChains for cost modelling.
7008       Instruction *LastChain = Phi;
7009       for (auto *I : ReductionOperations) {
7010         InLoopReductionImmediateChains[I] = LastChain;
7011         LastChain = I;
7012       }
7013     }
7014     LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
7015                       << " reduction for phi: " << *Phi << "\n");
7016   }
7017 }
7018 
7019 // This function will select a scalable VF if the target supports scalable
7020 // vectors and a fixed one otherwise.
7021 // TODO: we could return a pair of values that specify the max VF and
7022 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
7023 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
7024 // doesn't have a cost model that can choose which plan to execute if
7025 // more than one is generated.
7026 static ElementCount determineVPlanVF(const TargetTransformInfo &TTI,
7027                                      LoopVectorizationCostModel &CM) {
7028   unsigned WidestType;
7029   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
7030 
7031   TargetTransformInfo::RegisterKind RegKind =
7032       TTI.enableScalableVectorization()
7033           ? TargetTransformInfo::RGK_ScalableVector
7034           : TargetTransformInfo::RGK_FixedWidthVector;
7035 
7036   TypeSize RegSize = TTI.getRegisterBitWidth(RegKind);
7037   unsigned N = RegSize.getKnownMinValue() / WidestType;
7038   return ElementCount::get(N, RegSize.isScalable());
7039 }
7040 
7041 VectorizationFactor
7042 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
7043   ElementCount VF = UserVF;
7044   // Outer loop handling: They may require CFG and instruction level
7045   // transformations before even evaluating whether vectorization is profitable.
7046   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7047   // the vectorization pipeline.
7048   if (!OrigLoop->isInnermost()) {
7049     // If the user doesn't provide a vectorization factor, determine a
7050     // reasonable one.
7051     if (UserVF.isZero()) {
7052       VF = determineVPlanVF(TTI, CM);
7053       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
7054 
7055       // Make sure we have a VF > 1 for stress testing.
7056       if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
7057         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
7058                           << "overriding computed VF.\n");
7059         VF = ElementCount::getFixed(4);
7060       }
7061     } else if (UserVF.isScalable() && !TTI.supportsScalableVectors() &&
7062                !ForceTargetSupportsScalableVectors) {
7063       LLVM_DEBUG(dbgs() << "LV: Not vectorizing. Scalable VF requested, but "
7064                         << "not supported by the target.\n");
7065       reportVectorizationFailure(
7066           "Scalable vectorization requested but not supported by the target",
7067           "the scalable user-specified vectorization width for outer-loop "
7068           "vectorization cannot be used because the target does not support "
7069           "scalable vectors.",
7070           "ScalableVFUnfeasible", ORE, OrigLoop);
7071       return VectorizationFactor::Disabled();
7072     }
7073     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7074     assert(isPowerOf2_32(VF.getKnownMinValue()) &&
7075            "VF needs to be a power of two");
7076     LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
7077                       << "VF " << VF << " to build VPlans.\n");
7078     buildVPlans(VF, VF);
7079 
7080     // For VPlan build stress testing, we bail out after VPlan construction.
7081     if (VPlanBuildStressTest)
7082       return VectorizationFactor::Disabled();
7083 
7084     return {VF, 0 /*Cost*/, 0 /* ScalarCost */};
7085   }
7086 
7087   LLVM_DEBUG(
7088       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
7089                 "VPlan-native path.\n");
7090   return VectorizationFactor::Disabled();
7091 }
7092 
7093 void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
7094   assert(OrigLoop->isInnermost() && "Inner loop expected.");
7095   CM.collectValuesToIgnore();
7096   CM.collectElementTypesForWidening();
7097 
7098   FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
7099   if (!MaxFactors) // Cases that should not to be vectorized nor interleaved.
7100     return;
7101 
7102   // Invalidate interleave groups if all blocks of loop will be predicated.
7103   if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
7104       !useMaskedInterleavedAccesses(TTI)) {
7105     LLVM_DEBUG(
7106         dbgs()
7107         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
7108            "which requires masked-interleaved support.\n");
7109     if (CM.InterleaveInfo.invalidateGroups())
7110       // Invalidating interleave groups also requires invalidating all decisions
7111       // based on them, which includes widening decisions and uniform and scalar
7112       // values.
7113       CM.invalidateCostModelingDecisions();
7114   }
7115 
7116   if (CM.foldTailByMasking())
7117     Legal->prepareToFoldTailByMasking();
7118 
7119   ElementCount MaxUserVF =
7120       UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
7121   if (UserVF) {
7122     if (!ElementCount::isKnownLE(UserVF, MaxUserVF)) {
7123       reportVectorizationInfo(
7124           "UserVF ignored because it may be larger than the maximal safe VF",
7125           "InvalidUserVF", ORE, OrigLoop);
7126     } else {
7127       assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
7128              "VF needs to be a power of two");
7129       // Collect the instructions (and their associated costs) that will be more
7130       // profitable to scalarize.
7131       CM.collectInLoopReductions();
7132       if (CM.selectUserVectorizationFactor(UserVF)) {
7133         LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
7134         buildVPlansWithVPRecipes(UserVF, UserVF);
7135         LLVM_DEBUG(printPlans(dbgs()));
7136         return;
7137       }
7138       reportVectorizationInfo("UserVF ignored because of invalid costs.",
7139                               "InvalidCost", ORE, OrigLoop);
7140     }
7141   }
7142 
7143   // Collect the Vectorization Factor Candidates.
7144   SmallVector<ElementCount> VFCandidates;
7145   for (auto VF = ElementCount::getFixed(1);
7146        ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
7147     VFCandidates.push_back(VF);
7148   for (auto VF = ElementCount::getScalable(1);
7149        ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
7150     VFCandidates.push_back(VF);
7151 
7152   CM.collectInLoopReductions();
7153   for (const auto &VF : VFCandidates) {
7154     // Collect Uniform and Scalar instructions after vectorization with VF.
7155     CM.collectUniformsAndScalars(VF);
7156 
7157     // Collect the instructions (and their associated costs) that will be more
7158     // profitable to scalarize.
7159     if (VF.isVector())
7160       CM.collectInstsToScalarize(VF);
7161   }
7162 
7163   buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
7164   buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
7165 
7166   LLVM_DEBUG(printPlans(dbgs()));
7167 }
7168 
7169 InstructionCost VPCostContext::getLegacyCost(Instruction *UI,
7170                                              ElementCount VF) const {
7171   if (ForceTargetInstructionCost.getNumOccurrences())
7172     return InstructionCost(ForceTargetInstructionCost.getNumOccurrences());
7173   return CM.getInstructionCost(UI, VF);
7174 }
7175 
7176 bool VPCostContext::skipCostComputation(Instruction *UI, bool IsVector) const {
7177   return CM.ValuesToIgnore.contains(UI) ||
7178          (IsVector && CM.VecValuesToIgnore.contains(UI)) ||
7179          SkipCostComputation.contains(UI);
7180 }
7181 
7182 InstructionCost
7183 LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
7184                                           VPCostContext &CostCtx) const {
7185   InstructionCost Cost;
7186   // Cost modeling for inductions is inaccurate in the legacy cost model
7187   // compared to the recipes that are generated. To match here initially during
7188   // VPlan cost model bring up directly use the induction costs from the legacy
7189   // cost model. Note that we do this as pre-processing; the VPlan may not have
7190   // any recipes associated with the original induction increment instruction
7191   // and may replace truncates with VPWidenIntOrFpInductionRecipe. We precompute
7192   // the cost of induction phis and increments (both that are represented by
7193   // recipes and those that are not), to avoid distinguishing between them here,
7194   // and skip all recipes that represent induction phis and increments (the
7195   // former case) later on, if they exist, to avoid counting them twice.
7196   // Similarly we pre-compute the cost of any optimized truncates.
7197   // TODO: Switch to more accurate costing based on VPlan.
7198   for (const auto &[IV, IndDesc] : Legal->getInductionVars()) {
7199     Instruction *IVInc = cast<Instruction>(
7200         IV->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
7201     SmallVector<Instruction *> IVInsts = {IVInc};
7202     for (unsigned I = 0; I != IVInsts.size(); I++) {
7203       for (Value *Op : IVInsts[I]->operands()) {
7204         auto *OpI = dyn_cast<Instruction>(Op);
7205         if (Op == IV || !OpI || !OrigLoop->contains(OpI) || !Op->hasOneUse())
7206           continue;
7207         IVInsts.push_back(OpI);
7208       }
7209     }
7210     IVInsts.push_back(IV);
7211     for (User *U : IV->users()) {
7212       auto *CI = cast<Instruction>(U);
7213       if (!CostCtx.CM.isOptimizableIVTruncate(CI, VF))
7214         continue;
7215       IVInsts.push_back(CI);
7216     }
7217 
7218     // If the vector loop gets executed exactly once with the given VF, ignore
7219     // the costs of comparison and induction instructions, as they'll get
7220     // simplified away.
7221     // TODO: Remove this code after stepping away from the legacy cost model and
7222     // adding code to simplify VPlans before calculating their costs.
7223     auto TC = PSE.getSE()->getSmallConstantTripCount(OrigLoop);
7224     if (VF.isFixed() && TC == VF.getFixedValue() && !CM.foldTailByMasking())
7225       addFullyUnrolledInstructionsToIgnore(OrigLoop, Legal->getInductionVars(),
7226                                            CostCtx.SkipCostComputation);
7227 
7228     for (Instruction *IVInst : IVInsts) {
7229       if (CostCtx.skipCostComputation(IVInst, VF.isVector()))
7230         continue;
7231       InstructionCost InductionCost = CostCtx.getLegacyCost(IVInst, VF);
7232       LLVM_DEBUG({
7233         dbgs() << "Cost of " << InductionCost << " for VF " << VF
7234                << ": induction instruction " << *IVInst << "\n";
7235       });
7236       Cost += InductionCost;
7237       CostCtx.SkipCostComputation.insert(IVInst);
7238     }
7239   }
7240 
7241   /// Compute the cost of all exiting conditions of the loop using the legacy
7242   /// cost model. This is to match the legacy behavior, which adds the cost of
7243   /// all exit conditions. Note that this over-estimates the cost, as there will
7244   /// be a single condition to control the vector loop.
7245   SmallVector<BasicBlock *> Exiting;
7246   CM.TheLoop->getExitingBlocks(Exiting);
7247   SetVector<Instruction *> ExitInstrs;
7248   // Collect all exit conditions.
7249   for (BasicBlock *EB : Exiting) {
7250     auto *Term = dyn_cast<BranchInst>(EB->getTerminator());
7251     if (!Term)
7252       continue;
7253     if (auto *CondI = dyn_cast<Instruction>(Term->getOperand(0))) {
7254       ExitInstrs.insert(CondI);
7255     }
7256   }
7257   // Compute the cost of all instructions only feeding the exit conditions.
7258   for (unsigned I = 0; I != ExitInstrs.size(); ++I) {
7259     Instruction *CondI = ExitInstrs[I];
7260     if (!OrigLoop->contains(CondI) ||
7261         !CostCtx.SkipCostComputation.insert(CondI).second)
7262       continue;
7263     InstructionCost CondICost = CostCtx.getLegacyCost(CondI, VF);
7264     LLVM_DEBUG({
7265       dbgs() << "Cost of " << CondICost << " for VF " << VF
7266              << ": exit condition instruction " << *CondI << "\n";
7267     });
7268     Cost += CondICost;
7269     for (Value *Op : CondI->operands()) {
7270       auto *OpI = dyn_cast<Instruction>(Op);
7271       if (!OpI || any_of(OpI->users(), [&ExitInstrs, this](User *U) {
7272             return OrigLoop->contains(cast<Instruction>(U)->getParent()) &&
7273                    !ExitInstrs.contains(cast<Instruction>(U));
7274           }))
7275         continue;
7276       ExitInstrs.insert(OpI);
7277     }
7278   }
7279 
7280   // The legacy cost model has special logic to compute the cost of in-loop
7281   // reductions, which may be smaller than the sum of all instructions involved
7282   // in the reduction.
7283   // TODO: Switch to costing based on VPlan once the logic has been ported.
7284   for (const auto &[RedPhi, RdxDesc] : Legal->getReductionVars()) {
7285     if (ForceTargetInstructionCost.getNumOccurrences())
7286       continue;
7287 
7288     if (!CM.isInLoopReduction(RedPhi))
7289       continue;
7290 
7291     const auto &ChainOps = RdxDesc.getReductionOpChain(RedPhi, OrigLoop);
7292     SetVector<Instruction *> ChainOpsAndOperands(ChainOps.begin(),
7293                                                  ChainOps.end());
7294     auto IsZExtOrSExt = [](const unsigned Opcode) -> bool {
7295       return Opcode == Instruction::ZExt || Opcode == Instruction::SExt;
7296     };
7297     // Also include the operands of instructions in the chain, as the cost-model
7298     // may mark extends as free.
7299     //
7300     // For ARM, some of the instruction can folded into the reducion
7301     // instruction. So we need to mark all folded instructions free.
7302     // For example: We can fold reduce(mul(ext(A), ext(B))) into one
7303     // instruction.
7304     for (auto *ChainOp : ChainOps) {
7305       for (Value *Op : ChainOp->operands()) {
7306         if (auto *I = dyn_cast<Instruction>(Op)) {
7307           ChainOpsAndOperands.insert(I);
7308           if (I->getOpcode() == Instruction::Mul) {
7309             auto *Ext0 = dyn_cast<Instruction>(I->getOperand(0));
7310             auto *Ext1 = dyn_cast<Instruction>(I->getOperand(1));
7311             if (Ext0 && IsZExtOrSExt(Ext0->getOpcode()) && Ext1 &&
7312                 Ext0->getOpcode() == Ext1->getOpcode()) {
7313               ChainOpsAndOperands.insert(Ext0);
7314               ChainOpsAndOperands.insert(Ext1);
7315             }
7316           }
7317         }
7318       }
7319     }
7320 
7321     // Pre-compute the cost for I, if it has a reduction pattern cost.
7322     for (Instruction *I : ChainOpsAndOperands) {
7323       auto ReductionCost =
7324           CM.getReductionPatternCost(I, VF, toVectorTy(I->getType(), VF));
7325       if (!ReductionCost)
7326         continue;
7327 
7328       assert(!CostCtx.SkipCostComputation.contains(I) &&
7329              "reduction op visited multiple times");
7330       CostCtx.SkipCostComputation.insert(I);
7331       LLVM_DEBUG(dbgs() << "Cost of " << ReductionCost << " for VF " << VF
7332                         << ":\n in-loop reduction " << *I << "\n");
7333       Cost += *ReductionCost;
7334     }
7335   }
7336 
7337   // Pre-compute the costs for branches except for the backedge, as the number
7338   // of replicate regions in a VPlan may not directly match the number of
7339   // branches, which would lead to different decisions.
7340   // TODO: Compute cost of branches for each replicate region in the VPlan,
7341   // which is more accurate than the legacy cost model.
7342   for (BasicBlock *BB : OrigLoop->blocks()) {
7343     if (CostCtx.skipCostComputation(BB->getTerminator(), VF.isVector()))
7344       continue;
7345     CostCtx.SkipCostComputation.insert(BB->getTerminator());
7346     if (BB == OrigLoop->getLoopLatch())
7347       continue;
7348     auto BranchCost = CostCtx.getLegacyCost(BB->getTerminator(), VF);
7349     Cost += BranchCost;
7350   }
7351 
7352   // Pre-compute costs for instructions that are forced-scalar or profitable to
7353   // scalarize. Their costs will be computed separately in the legacy cost
7354   // model.
7355   for (Instruction *ForcedScalar : CM.ForcedScalars[VF]) {
7356     if (CostCtx.skipCostComputation(ForcedScalar, VF.isVector()))
7357       continue;
7358     CostCtx.SkipCostComputation.insert(ForcedScalar);
7359     InstructionCost ForcedCost = CostCtx.getLegacyCost(ForcedScalar, VF);
7360     LLVM_DEBUG({
7361       dbgs() << "Cost of " << ForcedCost << " for VF " << VF
7362              << ": forced scalar " << *ForcedScalar << "\n";
7363     });
7364     Cost += ForcedCost;
7365   }
7366   for (const auto &[Scalarized, ScalarCost] : CM.InstsToScalarize[VF]) {
7367     if (CostCtx.skipCostComputation(Scalarized, VF.isVector()))
7368       continue;
7369     CostCtx.SkipCostComputation.insert(Scalarized);
7370     LLVM_DEBUG({
7371       dbgs() << "Cost of " << ScalarCost << " for VF " << VF
7372              << ": profitable to scalarize " << *Scalarized << "\n";
7373     });
7374     Cost += ScalarCost;
7375   }
7376 
7377   return Cost;
7378 }
7379 
7380 InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
7381                                                ElementCount VF) const {
7382   VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM,
7383                         CM.CostKind);
7384   InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx);
7385 
7386   // Now compute and add the VPlan-based cost.
7387   Cost += Plan.cost(VF, CostCtx);
7388 #ifndef NDEBUG
7389   unsigned EstimatedWidth = getEstimatedRuntimeVF(OrigLoop, CM.TTI, VF);
7390   LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost
7391                     << " (Estimated cost per lane: ");
7392   if (Cost.isValid()) {
7393     double CostPerLane = double(*Cost.getValue()) / EstimatedWidth;
7394     LLVM_DEBUG(dbgs() << format("%.1f", CostPerLane));
7395   } else /* No point dividing an invalid cost - it will still be invalid */
7396     LLVM_DEBUG(dbgs() << "Invalid");
7397   LLVM_DEBUG(dbgs() << ")\n");
7398 #endif
7399   return Cost;
7400 }
7401 
7402 #ifndef NDEBUG
7403 /// Return true if the original loop \ TheLoop contains any instructions that do
7404 /// not have corresponding recipes in \p Plan and are not marked to be ignored
7405 /// in \p CostCtx. This means the VPlan contains simplification that the legacy
7406 /// cost-model did not account for.
7407 static bool planContainsAdditionalSimplifications(VPlan &Plan,
7408                                                   VPCostContext &CostCtx,
7409                                                   Loop *TheLoop) {
7410   // First collect all instructions for the recipes in Plan.
7411   auto GetInstructionForCost = [](const VPRecipeBase *R) -> Instruction * {
7412     if (auto *S = dyn_cast<VPSingleDefRecipe>(R))
7413       return dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
7414     if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(R))
7415       return &WidenMem->getIngredient();
7416     return nullptr;
7417   };
7418 
7419   DenseSet<Instruction *> SeenInstrs;
7420   auto Iter = vp_depth_first_deep(Plan.getVectorLoopRegion()->getEntry());
7421   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
7422     for (VPRecipeBase &R : *VPBB) {
7423       if (auto *IR = dyn_cast<VPInterleaveRecipe>(&R)) {
7424         auto *IG = IR->getInterleaveGroup();
7425         unsigned NumMembers = IG->getNumMembers();
7426         for (unsigned I = 0; I != NumMembers; ++I) {
7427           if (Instruction *M = IG->getMember(I))
7428             SeenInstrs.insert(M);
7429         }
7430         continue;
7431       }
7432       // The VPlan-based cost model is more accurate for partial reduction and
7433       // comparing against the legacy cost isn't desirable.
7434       if (isa<VPPartialReductionRecipe>(&R))
7435         return true;
7436       if (Instruction *UI = GetInstructionForCost(&R))
7437         SeenInstrs.insert(UI);
7438     }
7439   }
7440 
7441   // Return true if the loop contains any instructions that are not also part of
7442   // the VPlan or are skipped for VPlan-based cost computations. This indicates
7443   // that the VPlan contains extra simplifications.
7444   return any_of(TheLoop->blocks(), [&SeenInstrs, &CostCtx,
7445                                     TheLoop](BasicBlock *BB) {
7446     return any_of(*BB, [&SeenInstrs, &CostCtx, TheLoop, BB](Instruction &I) {
7447       if (isa<PHINode>(&I) && BB == TheLoop->getHeader())
7448         return false;
7449       return !SeenInstrs.contains(&I) && !CostCtx.skipCostComputation(&I, true);
7450     });
7451   });
7452 }
7453 #endif
7454 
7455 VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
7456   if (VPlans.empty())
7457     return VectorizationFactor::Disabled();
7458   // If there is a single VPlan with a single VF, return it directly.
7459   VPlan &FirstPlan = *VPlans[0];
7460   if (VPlans.size() == 1 && size(FirstPlan.vectorFactors()) == 1)
7461     return {*FirstPlan.vectorFactors().begin(), 0, 0};
7462 
7463   LLVM_DEBUG(dbgs() << "LV: Computing best VF using cost kind: "
7464                     << (CM.CostKind == TTI::TCK_RecipThroughput
7465                             ? "Reciprocal Throughput\n"
7466                         : CM.CostKind == TTI::TCK_Latency
7467                             ? "Instruction Latency\n"
7468                         : CM.CostKind == TTI::TCK_CodeSize ? "Code Size\n"
7469                         : CM.CostKind == TTI::TCK_SizeAndLatency
7470                             ? "Code Size and Latency\n"
7471                             : "Unknown\n"));
7472 
7473   ElementCount ScalarVF = ElementCount::getFixed(1);
7474   assert(hasPlanWithVF(ScalarVF) &&
7475          "More than a single plan/VF w/o any plan having scalar VF");
7476 
7477   // TODO: Compute scalar cost using VPlan-based cost model.
7478   InstructionCost ScalarCost = CM.expectedCost(ScalarVF);
7479   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ScalarCost << ".\n");
7480   VectorizationFactor ScalarFactor(ScalarVF, ScalarCost, ScalarCost);
7481   VectorizationFactor BestFactor = ScalarFactor;
7482 
7483   bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
7484   if (ForceVectorization) {
7485     // Ignore scalar width, because the user explicitly wants vectorization.
7486     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
7487     // evaluation.
7488     BestFactor.Cost = InstructionCost::getMax();
7489   }
7490 
7491   for (auto &P : VPlans) {
7492     for (ElementCount VF : P->vectorFactors()) {
7493       if (VF.isScalar())
7494         continue;
7495       if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
7496         LLVM_DEBUG(
7497             dbgs()
7498             << "LV: Not considering vector loop of width " << VF
7499             << " because it will not generate any vector instructions.\n");
7500         continue;
7501       }
7502 
7503       InstructionCost Cost = cost(*P, VF);
7504       VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);
7505       if (isMoreProfitable(CurrentFactor, BestFactor))
7506         BestFactor = CurrentFactor;
7507 
7508       // If profitable add it to ProfitableVF list.
7509       if (isMoreProfitable(CurrentFactor, ScalarFactor))
7510         ProfitableVFs.push_back(CurrentFactor);
7511     }
7512   }
7513 
7514 #ifndef NDEBUG
7515   // Select the optimal vectorization factor according to the legacy cost-model.
7516   // This is now only used to verify the decisions by the new VPlan-based
7517   // cost-model and will be retired once the VPlan-based cost-model is
7518   // stabilized.
7519   VectorizationFactor LegacyVF = selectVectorizationFactor();
7520   VPlan &BestPlan = getPlanFor(BestFactor.Width);
7521 
7522   // Pre-compute the cost and use it to check if BestPlan contains any
7523   // simplifications not accounted for in the legacy cost model. If that's the
7524   // case, don't trigger the assertion, as the extra simplifications may cause a
7525   // different VF to be picked by the VPlan-based cost model.
7526   VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM,
7527                         CM.CostKind);
7528   precomputeCosts(BestPlan, BestFactor.Width, CostCtx);
7529   assert((BestFactor.Width == LegacyVF.Width ||
7530           planContainsAdditionalSimplifications(getPlanFor(BestFactor.Width),
7531                                                 CostCtx, OrigLoop) ||
7532           planContainsAdditionalSimplifications(getPlanFor(LegacyVF.Width),
7533                                                 CostCtx, OrigLoop)) &&
7534          " VPlan cost model and legacy cost model disagreed");
7535   assert((BestFactor.Width.isScalar() || BestFactor.ScalarCost > 0) &&
7536          "when vectorizing, the scalar cost must be computed.");
7537 #endif
7538 
7539   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << BestFactor.Width << ".\n");
7540   return BestFactor;
7541 }
7542 
7543 static void addRuntimeUnrollDisableMetaData(Loop *L) {
7544   SmallVector<Metadata *, 4> MDs;
7545   // Reserve first location for self reference to the LoopID metadata node.
7546   MDs.push_back(nullptr);
7547   bool IsUnrollMetadata = false;
7548   MDNode *LoopID = L->getLoopID();
7549   if (LoopID) {
7550     // First find existing loop unrolling disable metadata.
7551     for (unsigned I = 1, IE = LoopID->getNumOperands(); I < IE; ++I) {
7552       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(I));
7553       if (MD) {
7554         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
7555         IsUnrollMetadata =
7556             S && S->getString().starts_with("llvm.loop.unroll.disable");
7557       }
7558       MDs.push_back(LoopID->getOperand(I));
7559     }
7560   }
7561 
7562   if (!IsUnrollMetadata) {
7563     // Add runtime unroll disable metadata.
7564     LLVMContext &Context = L->getHeader()->getContext();
7565     SmallVector<Metadata *, 1> DisableOperands;
7566     DisableOperands.push_back(
7567         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7568     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7569     MDs.push_back(DisableNode);
7570     MDNode *NewLoopID = MDNode::get(Context, MDs);
7571     // Set operand 0 to refer to the loop id itself.
7572     NewLoopID->replaceOperandWith(0, NewLoopID);
7573     L->setLoopID(NewLoopID);
7574   }
7575 }
7576 
7577 // If \p R is a ComputeReductionResult when vectorizing the epilog loop,
7578 // fix the reduction's scalar PHI node by adding the incoming value from the
7579 // main vector loop.
7580 static void fixReductionScalarResumeWhenVectorizingEpilog(
7581     VPRecipeBase *R, VPTransformState &State, BasicBlock *LoopMiddleBlock,
7582     BasicBlock *BypassBlock) {
7583   auto *EpiRedResult = dyn_cast<VPInstruction>(R);
7584   if (!EpiRedResult ||
7585       EpiRedResult->getOpcode() != VPInstruction::ComputeReductionResult)
7586     return;
7587 
7588   auto *EpiRedHeaderPhi =
7589       cast<VPReductionPHIRecipe>(EpiRedResult->getOperand(0));
7590   const RecurrenceDescriptor &RdxDesc =
7591       EpiRedHeaderPhi->getRecurrenceDescriptor();
7592   Value *MainResumeValue =
7593       EpiRedHeaderPhi->getStartValue()->getUnderlyingValue();
7594   if (RecurrenceDescriptor::isAnyOfRecurrenceKind(
7595           RdxDesc.getRecurrenceKind())) {
7596     auto *Cmp = cast<ICmpInst>(MainResumeValue);
7597     assert(Cmp->getPredicate() == CmpInst::ICMP_NE &&
7598            "AnyOf expected to start with ICMP_NE");
7599     assert(Cmp->getOperand(1) == RdxDesc.getRecurrenceStartValue() &&
7600            "AnyOf expected to start by comparing main resume value to original "
7601            "start value");
7602     MainResumeValue = Cmp->getOperand(0);
7603   } else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(
7604                  RdxDesc.getRecurrenceKind())) {
7605     using namespace llvm::PatternMatch;
7606     Value *Cmp, *OrigResumeV;
7607     bool IsExpectedPattern =
7608         match(MainResumeValue, m_Select(m_OneUse(m_Value(Cmp)),
7609                                         m_Specific(RdxDesc.getSentinelValue()),
7610                                         m_Value(OrigResumeV))) &&
7611         match(Cmp,
7612               m_SpecificICmp(ICmpInst::ICMP_EQ, m_Specific(OrigResumeV),
7613                              m_Specific(RdxDesc.getRecurrenceStartValue())));
7614     assert(IsExpectedPattern && "Unexpected reduction resume pattern");
7615     (void)IsExpectedPattern;
7616     MainResumeValue = OrigResumeV;
7617   }
7618   PHINode *MainResumePhi = cast<PHINode>(MainResumeValue);
7619 
7620   // When fixing reductions in the epilogue loop we should already have
7621   // created a bc.merge.rdx Phi after the main vector body. Ensure that we carry
7622   // over the incoming values correctly.
7623   using namespace VPlanPatternMatch;
7624   auto IsResumePhi = [](VPUser *U) {
7625     return match(
7626         U, m_VPInstruction<VPInstruction::ResumePhi>(m_VPValue(), m_VPValue()));
7627   };
7628   assert(count_if(EpiRedResult->users(), IsResumePhi) == 1 &&
7629          "ResumePhi must have a single user");
7630   auto *EpiResumePhiVPI =
7631       cast<VPInstruction>(*find_if(EpiRedResult->users(), IsResumePhi));
7632   auto *EpiResumePhi = cast<PHINode>(State.get(EpiResumePhiVPI, true));
7633   EpiResumePhi->setIncomingValueForBlock(
7634       BypassBlock, MainResumePhi->getIncomingValueForBlock(BypassBlock));
7635 }
7636 
7637 DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
7638     ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan,
7639     InnerLoopVectorizer &ILV, DominatorTree *DT, bool VectorizingEpilogue,
7640     const DenseMap<const SCEV *, Value *> *ExpandedSCEVs) {
7641   assert(BestVPlan.hasVF(BestVF) &&
7642          "Trying to execute plan with unsupported VF");
7643   assert(BestVPlan.hasUF(BestUF) &&
7644          "Trying to execute plan with unsupported UF");
7645   assert(
7646       ((VectorizingEpilogue && ExpandedSCEVs) ||
7647        (!VectorizingEpilogue && !ExpandedSCEVs)) &&
7648       "expanded SCEVs to reuse can only be used during epilogue vectorization");
7649 
7650   // TODO: Move to VPlan transform stage once the transition to the VPlan-based
7651   // cost model is complete for better cost estimates.
7652   VPlanTransforms::unrollByUF(BestVPlan, BestUF,
7653                               OrigLoop->getHeader()->getContext());
7654   VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
7655   VPlanTransforms::convertToConcreteRecipes(BestVPlan);
7656 
7657   // Perform the actual loop transformation.
7658   VPTransformState State(&TTI, BestVF, BestUF, LI, DT, ILV.Builder, &ILV,
7659                          &BestVPlan, OrigLoop->getParentLoop(),
7660                          Legal->getWidestInductionType());
7661 
7662 #ifdef EXPENSIVE_CHECKS
7663   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
7664 #endif
7665 
7666   // 0. Generate SCEV-dependent code in the entry, including TripCount, before
7667   // making any changes to the CFG.
7668   if (!BestVPlan.getEntry()->empty())
7669     BestVPlan.getEntry()->execute(&State);
7670 
7671   if (!ILV.getTripCount())
7672     ILV.setTripCount(State.get(BestVPlan.getTripCount(), VPLane(0)));
7673   else
7674     assert(VectorizingEpilogue && "should only re-use the existing trip "
7675                                   "count during epilogue vectorization");
7676 
7677   // 1. Set up the skeleton for vectorization, including vector pre-header and
7678   // middle block. The vector loop is created during VPlan execution.
7679   VPBasicBlock *VectorPH =
7680       cast<VPBasicBlock>(BestVPlan.getEntry()->getSingleSuccessor());
7681   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(
7682       ExpandedSCEVs ? *ExpandedSCEVs : State.ExpandedSCEVs);
7683   if (VectorizingEpilogue)
7684     VPlanTransforms::removeDeadRecipes(BestVPlan);
7685 
7686   // Only use noalias metadata when using memory checks guaranteeing no overlap
7687   // across all iterations.
7688   const LoopAccessInfo *LAI = ILV.Legal->getLAI();
7689   std::unique_ptr<LoopVersioning> LVer = nullptr;
7690   if (LAI && !LAI->getRuntimePointerChecking()->getChecks().empty() &&
7691       !LAI->getRuntimePointerChecking()->getDiffChecks()) {
7692 
7693     //  We currently don't use LoopVersioning for the actual loop cloning but we
7694     //  still use it to add the noalias metadata.
7695     //  TODO: Find a better way to re-use LoopVersioning functionality to add
7696     //        metadata.
7697     LVer = std::make_unique<LoopVersioning>(
7698         *LAI, LAI->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, DT,
7699         PSE.getSE());
7700     State.LVer = &*LVer;
7701     State.LVer->prepareNoAliasMetadata();
7702   }
7703 
7704   ILV.printDebugTracesAtStart();
7705 
7706   //===------------------------------------------------===//
7707   //
7708   // Notice: any optimization or new instruction that go
7709   // into the code below should also be implemented in
7710   // the cost-model.
7711   //
7712   //===------------------------------------------------===//
7713 
7714   // 2. Copy and widen instructions from the old loop into the new loop.
7715   BestVPlan.prepareToExecute(
7716       ILV.getTripCount(),
7717       ILV.getOrCreateVectorTripCount(ILV.LoopVectorPreHeader), State);
7718   replaceVPBBWithIRVPBB(VectorPH, State.CFG.PrevBB);
7719 
7720   BestVPlan.execute(&State);
7721 
7722   auto *MiddleVPBB = BestVPlan.getMiddleBlock();
7723   // 2.5 When vectorizing the epilogue, fix reduction and induction resume
7724   // values from the additional bypass block.
7725   if (VectorizingEpilogue) {
7726     assert(!ILV.Legal->hasUncountableEarlyExit() &&
7727            "Epilogue vectorisation not yet supported with early exits");
7728     BasicBlock *BypassBlock = ILV.getAdditionalBypassBlock();
7729     for (VPRecipeBase &R : *MiddleVPBB) {
7730       fixReductionScalarResumeWhenVectorizingEpilog(
7731           &R, State, State.CFG.VPBB2IRBB[MiddleVPBB], BypassBlock);
7732     }
7733     BasicBlock *PH = OrigLoop->getLoopPreheader();
7734     for (const auto &[IVPhi, _] : Legal->getInductionVars()) {
7735       auto *Inc = cast<PHINode>(IVPhi->getIncomingValueForBlock(PH));
7736       Value *V = ILV.getInductionAdditionalBypassValue(IVPhi);
7737       Inc->setIncomingValueForBlock(BypassBlock, V);
7738     }
7739   }
7740 
7741   // 2.6. Maintain Loop Hints
7742   // Keep all loop hints from the original loop on the vector loop (we'll
7743   // replace the vectorizer-specific hints below).
7744   if (auto *LoopRegion = BestVPlan.getVectorLoopRegion()) {
7745     MDNode *OrigLoopID = OrigLoop->getLoopID();
7746 
7747     std::optional<MDNode *> VectorizedLoopID =
7748         makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7749                                         LLVMLoopVectorizeFollowupVectorized});
7750 
7751     VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
7752     Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]);
7753     if (VectorizedLoopID) {
7754       L->setLoopID(*VectorizedLoopID);
7755     } else {
7756       // Keep all loop hints from the original loop on the vector loop (we'll
7757       // replace the vectorizer-specific hints below).
7758       if (MDNode *LID = OrigLoop->getLoopID())
7759         L->setLoopID(LID);
7760 
7761       LoopVectorizeHints Hints(L, true, *ORE);
7762       Hints.setAlreadyVectorized();
7763     }
7764     TargetTransformInfo::UnrollingPreferences UP;
7765     TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE);
7766     if (!UP.UnrollVectorizedLoop || VectorizingEpilogue)
7767       addRuntimeUnrollDisableMetaData(L);
7768   }
7769 
7770   // 3. Fix the vectorized code: take care of header phi's, live-outs,
7771   //    predication, updating analyses.
7772   ILV.fixVectorizedLoop(State);
7773 
7774   ILV.printDebugTracesAtEnd();
7775 
7776   // 4. Adjust branch weight of the branch in the middle block.
7777   if (BestVPlan.getVectorLoopRegion()) {
7778     auto *MiddleVPBB = BestVPlan.getMiddleBlock();
7779     auto *MiddleTerm =
7780         cast<BranchInst>(State.CFG.VPBB2IRBB[MiddleVPBB]->getTerminator());
7781     if (MiddleTerm->isConditional() &&
7782         hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
7783       // Assume that `Count % VectorTripCount` is equally distributed.
7784       unsigned TripCount = BestVPlan.getUF() * State.VF.getKnownMinValue();
7785       assert(TripCount > 0 && "trip count should not be zero");
7786       const uint32_t Weights[] = {1, TripCount - 1};
7787       setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false);
7788     }
7789   }
7790 
7791   return State.ExpandedSCEVs;
7792 }
7793 
7794 //===--------------------------------------------------------------------===//
7795 // EpilogueVectorizerMainLoop
7796 //===--------------------------------------------------------------------===//
7797 
7798 /// This function is partially responsible for generating the control flow
7799 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7800 BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton(
7801     const SCEV2ValueTy &ExpandedSCEVs) {
7802   createVectorLoopSkeleton("");
7803 
7804   // Generate the code to check the minimum iteration count of the vector
7805   // epilogue (see below).
7806   EPI.EpilogueIterationCountCheck =
7807       emitIterationCountCheck(LoopScalarPreHeader, true);
7808   EPI.EpilogueIterationCountCheck->setName("iter.check");
7809 
7810   // Generate the code to check any assumptions that we've made for SCEV
7811   // expressions.
7812   EPI.SCEVSafetyCheck = emitSCEVChecks(LoopScalarPreHeader);
7813 
7814   // Generate the code that checks at runtime if arrays overlap. We put the
7815   // checks into a separate block to make the more common case of few elements
7816   // faster.
7817   EPI.MemSafetyCheck = emitMemRuntimeChecks(LoopScalarPreHeader);
7818 
7819   // Generate the iteration count check for the main loop, *after* the check
7820   // for the epilogue loop, so that the path-length is shorter for the case
7821   // that goes directly through the vector epilogue. The longer-path length for
7822   // the main loop is compensated for, by the gain from vectorizing the larger
7823   // trip count. Note: the branch will get updated later on when we vectorize
7824   // the epilogue.
7825   EPI.MainLoopIterationCountCheck =
7826       emitIterationCountCheck(LoopScalarPreHeader, false);
7827 
7828   // Generate the induction variable.
7829   EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
7830 
7831   return LoopVectorPreHeader;
7832 }
7833 
7834 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
7835   LLVM_DEBUG({
7836     dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7837            << "Main Loop VF:" << EPI.MainLoopVF
7838            << ", Main Loop UF:" << EPI.MainLoopUF
7839            << ", Epilogue Loop VF:" << EPI.EpilogueVF
7840            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7841   });
7842 }
7843 
7844 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
7845   DEBUG_WITH_TYPE(VerboseDebug, {
7846     dbgs() << "intermediate fn:\n"
7847            << *OrigLoop->getHeader()->getParent() << "\n";
7848   });
7849 }
7850 
7851 BasicBlock *
7852 EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
7853                                                     bool ForEpilogue) {
7854   assert(Bypass && "Expected valid bypass basic block.");
7855   ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF;
7856   unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
7857   Value *Count = getTripCount();
7858   // Reuse existing vector loop preheader for TC checks.
7859   // Note that new preheader block is generated for vector loop.
7860   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
7861   IRBuilder<> Builder(TCCheckBlock->getTerminator());
7862 
7863   // Generate code to check if the loop's trip count is less than VF * UF of the
7864   // main vector loop.
7865   auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF.isVector()
7866                                                     : VF.isVector())
7867                ? ICmpInst::ICMP_ULE
7868                : ICmpInst::ICMP_ULT;
7869 
7870   Value *CheckMinIters = Builder.CreateICmp(
7871       P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor),
7872       "min.iters.check");
7873 
7874   if (!ForEpilogue)
7875     TCCheckBlock->setName("vector.main.loop.iter.check");
7876 
7877   // Create new preheader for vector loop.
7878   LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
7879                                    DT, LI, nullptr, "vector.ph");
7880 
7881   if (ForEpilogue) {
7882     assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
7883                                  DT->getNode(Bypass)->getIDom()) &&
7884            "TC check is expected to dominate Bypass");
7885 
7886     LoopBypassBlocks.push_back(TCCheckBlock);
7887 
7888     // Save the trip count so we don't have to regenerate it in the
7889     // vec.epilog.iter.check. This is safe to do because the trip count
7890     // generated here dominates the vector epilog iter check.
7891     EPI.TripCount = Count;
7892   }
7893 
7894   BranchInst &BI =
7895       *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
7896   if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
7897     setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
7898   ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
7899 
7900   introduceCheckBlockInVPlan(TCCheckBlock);
7901   return TCCheckBlock;
7902 }
7903 
7904 //===--------------------------------------------------------------------===//
7905 // EpilogueVectorizerEpilogueLoop
7906 //===--------------------------------------------------------------------===//
7907 
7908 /// This function is partially responsible for generating the control flow
7909 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7910 BasicBlock *
7911 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
7912     const SCEV2ValueTy &ExpandedSCEVs) {
7913   createVectorLoopSkeleton("vec.epilog.");
7914 
7915   // Now, compare the remaining count and if there aren't enough iterations to
7916   // execute the vectorized epilogue skip to the scalar part.
7917   LoopVectorPreHeader->setName("vec.epilog.ph");
7918   BasicBlock *VecEpilogueIterationCountCheck =
7919       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->begin(), DT, LI,
7920                  nullptr, "vec.epilog.iter.check", true);
7921   emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader,
7922                                           VecEpilogueIterationCountCheck);
7923   AdditionalBypassBlock = VecEpilogueIterationCountCheck;
7924 
7925   // Adjust the control flow taking the state info from the main loop
7926   // vectorization into account.
7927   assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
7928          "expected this to be saved from the previous pass.");
7929   EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
7930       VecEpilogueIterationCountCheck, LoopVectorPreHeader);
7931 
7932   EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
7933       VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7934 
7935   if (EPI.SCEVSafetyCheck)
7936     EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
7937         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7938   if (EPI.MemSafetyCheck)
7939     EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
7940         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7941 
7942   DT->changeImmediateDominator(LoopScalarPreHeader,
7943                                EPI.EpilogueIterationCountCheck);
7944   // Keep track of bypass blocks, as they feed start values to the induction and
7945   // reduction phis in the scalar loop preheader.
7946   if (EPI.SCEVSafetyCheck)
7947     LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
7948   if (EPI.MemSafetyCheck)
7949     LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
7950   LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
7951 
7952   // The vec.epilog.iter.check block may contain Phi nodes from inductions or
7953   // reductions which merge control-flow from the latch block and the middle
7954   // block. Update the incoming values here and move the Phi into the preheader.
7955   SmallVector<PHINode *, 4> PhisInBlock;
7956   for (PHINode &Phi : VecEpilogueIterationCountCheck->phis())
7957     PhisInBlock.push_back(&Phi);
7958 
7959   for (PHINode *Phi : PhisInBlock) {
7960     Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI());
7961     Phi->replaceIncomingBlockWith(
7962         VecEpilogueIterationCountCheck->getSinglePredecessor(),
7963         VecEpilogueIterationCountCheck);
7964 
7965     // If the phi doesn't have an incoming value from the
7966     // EpilogueIterationCountCheck, we are done. Otherwise remove the incoming
7967     // value and also those from other check blocks. This is needed for
7968     // reduction phis only.
7969     if (none_of(Phi->blocks(), [&](BasicBlock *IncB) {
7970           return EPI.EpilogueIterationCountCheck == IncB;
7971         }))
7972       continue;
7973     Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
7974     if (EPI.SCEVSafetyCheck)
7975       Phi->removeIncomingValue(EPI.SCEVSafetyCheck);
7976     if (EPI.MemSafetyCheck)
7977       Phi->removeIncomingValue(EPI.MemSafetyCheck);
7978   }
7979 
7980   // Generate bypass values from the additional bypass block. Note that when the
7981   // vectorized epilogue is skipped due to iteration count check, then the
7982   // resume value for the induction variable comes from the trip count of the
7983   // main vector loop, passed as the second argument.
7984   createInductionAdditionalBypassValues(ExpandedSCEVs, EPI.VectorTripCount);
7985   return LoopVectorPreHeader;
7986 }
7987 
7988 BasicBlock *
7989 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
7990     BasicBlock *Bypass, BasicBlock *Insert) {
7991 
7992   assert(EPI.TripCount &&
7993          "Expected trip count to have been saved in the first pass.");
7994   assert(
7995       (!isa<Instruction>(EPI.TripCount) ||
7996        DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
7997       "saved trip count does not dominate insertion point.");
7998   Value *TC = EPI.TripCount;
7999   IRBuilder<> Builder(Insert->getTerminator());
8000   Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
8001 
8002   // Generate code to check if the loop's trip count is less than VF * UF of the
8003   // vector epilogue loop.
8004   auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector())
8005                ? ICmpInst::ICMP_ULE
8006                : ICmpInst::ICMP_ULT;
8007 
8008   Value *CheckMinIters =
8009       Builder.CreateICmp(P, Count,
8010                          createStepForVF(Builder, Count->getType(),
8011                                          EPI.EpilogueVF, EPI.EpilogueUF),
8012                          "min.epilog.iters.check");
8013 
8014   BranchInst &BI =
8015       *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
8016   if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
8017     unsigned MainLoopStep = UF * VF.getKnownMinValue();
8018     unsigned EpilogueLoopStep =
8019         EPI.EpilogueUF * EPI.EpilogueVF.getKnownMinValue();
8020     // We assume the remaining `Count` is equally distributed in
8021     // [0, MainLoopStep)
8022     // So the probability for `Count < EpilogueLoopStep` should be
8023     // min(MainLoopStep, EpilogueLoopStep) / MainLoopStep
8024     unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep);
8025     const uint32_t Weights[] = {EstimatedSkipCount,
8026                                 MainLoopStep - EstimatedSkipCount};
8027     setBranchWeights(BI, Weights, /*IsExpected=*/false);
8028   }
8029   ReplaceInstWithInst(Insert->getTerminator(), &BI);
8030   LoopBypassBlocks.push_back(Insert);
8031 
8032   // A new entry block has been created for the epilogue VPlan. Hook it in, as
8033   // otherwise we would try to modify the entry to the main vector loop.
8034   VPIRBasicBlock *NewEntry = Plan.createVPIRBasicBlock(Insert);
8035   VPBasicBlock *OldEntry = Plan.getEntry();
8036   VPBlockUtils::reassociateBlocks(OldEntry, NewEntry);
8037   Plan.setEntry(NewEntry);
8038   // OldEntry is now dead and will be cleaned up when the plan gets destroyed.
8039 
8040   introduceCheckBlockInVPlan(Insert);
8041   return Insert;
8042 }
8043 
8044 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
8045   LLVM_DEBUG({
8046     dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
8047            << "Epilogue Loop VF:" << EPI.EpilogueVF
8048            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
8049   });
8050 }
8051 
8052 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
8053   DEBUG_WITH_TYPE(VerboseDebug, {
8054     dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
8055   });
8056 }
8057 
8058 iterator_range<mapped_iterator<Use *, std::function<VPValue *(Value *)>>>
8059 VPRecipeBuilder::mapToVPValues(User::op_range Operands) {
8060   std::function<VPValue *(Value *)> Fn = [this](Value *Op) {
8061     return getVPValueOrAddLiveIn(Op);
8062   };
8063   return map_range(Operands, Fn);
8064 }
8065 
8066 void VPRecipeBuilder::createSwitchEdgeMasks(SwitchInst *SI) {
8067   BasicBlock *Src = SI->getParent();
8068   assert(!OrigLoop->isLoopExiting(Src) &&
8069          all_of(successors(Src),
8070                 [this](BasicBlock *Succ) {
8071                   return OrigLoop->getHeader() != Succ;
8072                 }) &&
8073          "unsupported switch either exiting loop or continuing to header");
8074   // Create masks where the terminator in Src is a switch. We create mask for
8075   // all edges at the same time. This is more efficient, as we can create and
8076   // collect compares for all cases once.
8077   VPValue *Cond = getVPValueOrAddLiveIn(SI->getCondition());
8078   BasicBlock *DefaultDst = SI->getDefaultDest();
8079   MapVector<BasicBlock *, SmallVector<VPValue *>> Dst2Compares;
8080   for (auto &C : SI->cases()) {
8081     BasicBlock *Dst = C.getCaseSuccessor();
8082     assert(!EdgeMaskCache.contains({Src, Dst}) && "Edge masks already created");
8083     // Cases whose destination is the same as default are redundant and can be
8084     // ignored - they will get there anyhow.
8085     if (Dst == DefaultDst)
8086       continue;
8087     auto &Compares = Dst2Compares[Dst];
8088     VPValue *V = getVPValueOrAddLiveIn(C.getCaseValue());
8089     Compares.push_back(Builder.createICmp(CmpInst::ICMP_EQ, Cond, V));
8090   }
8091 
8092   // We need to handle 2 separate cases below for all entries in Dst2Compares,
8093   // which excludes destinations matching the default destination.
8094   VPValue *SrcMask = getBlockInMask(Src);
8095   VPValue *DefaultMask = nullptr;
8096   for (const auto &[Dst, Conds] : Dst2Compares) {
8097     // 1. Dst is not the default destination. Dst is reached if any of the cases
8098     // with destination == Dst are taken. Join the conditions for each case
8099     // whose destination == Dst using an OR.
8100     VPValue *Mask = Conds[0];
8101     for (VPValue *V : ArrayRef<VPValue *>(Conds).drop_front())
8102       Mask = Builder.createOr(Mask, V);
8103     if (SrcMask)
8104       Mask = Builder.createLogicalAnd(SrcMask, Mask);
8105     EdgeMaskCache[{Src, Dst}] = Mask;
8106 
8107     // 2. Create the mask for the default destination, which is reached if none
8108     // of the cases with destination != default destination are taken. Join the
8109     // conditions for each case where the destination is != Dst using an OR and
8110     // negate it.
8111     DefaultMask = DefaultMask ? Builder.createOr(DefaultMask, Mask) : Mask;
8112   }
8113 
8114   if (DefaultMask) {
8115     DefaultMask = Builder.createNot(DefaultMask);
8116     if (SrcMask)
8117       DefaultMask = Builder.createLogicalAnd(SrcMask, DefaultMask);
8118   }
8119   EdgeMaskCache[{Src, DefaultDst}] = DefaultMask;
8120 }
8121 
8122 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
8123   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
8124 
8125   // Look for cached value.
8126   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
8127   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
8128   if (ECEntryIt != EdgeMaskCache.end())
8129     return ECEntryIt->second;
8130 
8131   if (auto *SI = dyn_cast<SwitchInst>(Src->getTerminator())) {
8132     createSwitchEdgeMasks(SI);
8133     assert(EdgeMaskCache.contains(Edge) && "Mask for Edge not created?");
8134     return EdgeMaskCache[Edge];
8135   }
8136 
8137   VPValue *SrcMask = getBlockInMask(Src);
8138 
8139   // The terminator has to be a branch inst!
8140   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
8141   assert(BI && "Unexpected terminator found");
8142   if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
8143     return EdgeMaskCache[Edge] = SrcMask;
8144 
8145   // If source is an exiting block, we know the exit edge is dynamically dead
8146   // in the vector loop, and thus we don't need to restrict the mask.  Avoid
8147   // adding uses of an otherwise potentially dead instruction unless we are
8148   // vectorizing a loop with uncountable exits. In that case, we always
8149   // materialize the mask.
8150   if (OrigLoop->isLoopExiting(Src) &&
8151       Src != Legal->getUncountableEarlyExitingBlock())
8152     return EdgeMaskCache[Edge] = SrcMask;
8153 
8154   VPValue *EdgeMask = getVPValueOrAddLiveIn(BI->getCondition());
8155   assert(EdgeMask && "No Edge Mask found for condition");
8156 
8157   if (BI->getSuccessor(0) != Dst)
8158     EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc());
8159 
8160   if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
8161     // The bitwise 'And' of SrcMask and EdgeMask introduces new UB if SrcMask
8162     // is false and EdgeMask is poison. Avoid that by using 'LogicalAnd'
8163     // instead which generates 'select i1 SrcMask, i1 EdgeMask, i1 false'.
8164     EdgeMask = Builder.createLogicalAnd(SrcMask, EdgeMask, BI->getDebugLoc());
8165   }
8166 
8167   return EdgeMaskCache[Edge] = EdgeMask;
8168 }
8169 
8170 VPValue *VPRecipeBuilder::getEdgeMask(BasicBlock *Src, BasicBlock *Dst) const {
8171   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
8172 
8173   // Look for cached value.
8174   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
8175   EdgeMaskCacheTy::const_iterator ECEntryIt = EdgeMaskCache.find(Edge);
8176   assert(ECEntryIt != EdgeMaskCache.end() &&
8177          "looking up mask for edge which has not been created");
8178   return ECEntryIt->second;
8179 }
8180 
8181 void VPRecipeBuilder::createHeaderMask() {
8182   BasicBlock *Header = OrigLoop->getHeader();
8183 
8184   // When not folding the tail, use nullptr to model all-true mask.
8185   if (!CM.foldTailByMasking()) {
8186     BlockMaskCache[Header] = nullptr;
8187     return;
8188   }
8189 
8190   // Introduce the early-exit compare IV <= BTC to form header block mask.
8191   // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
8192   // constructing the desired canonical IV in the header block as its first
8193   // non-phi instructions.
8194 
8195   VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
8196   auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
8197   auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV());
8198   HeaderVPBB->insert(IV, NewInsertionPoint);
8199 
8200   VPBuilder::InsertPointGuard Guard(Builder);
8201   Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
8202   VPValue *BlockMask = nullptr;
8203   VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
8204   BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC);
8205   BlockMaskCache[Header] = BlockMask;
8206 }
8207 
8208 VPValue *VPRecipeBuilder::getBlockInMask(BasicBlock *BB) const {
8209   // Return the cached value.
8210   BlockMaskCacheTy::const_iterator BCEntryIt = BlockMaskCache.find(BB);
8211   assert(BCEntryIt != BlockMaskCache.end() &&
8212          "Trying to access mask for block without one.");
8213   return BCEntryIt->second;
8214 }
8215 
8216 void VPRecipeBuilder::createBlockInMask(BasicBlock *BB) {
8217   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
8218   assert(BlockMaskCache.count(BB) == 0 && "Mask for block already computed");
8219   assert(OrigLoop->getHeader() != BB &&
8220          "Loop header must have cached block mask");
8221 
8222   // All-one mask is modelled as no-mask following the convention for masked
8223   // load/store/gather/scatter. Initialize BlockMask to no-mask.
8224   VPValue *BlockMask = nullptr;
8225   // This is the block mask. We OR all unique incoming edges.
8226   for (auto *Predecessor :
8227        SetVector<BasicBlock *>(pred_begin(BB), pred_end(BB))) {
8228     VPValue *EdgeMask = createEdgeMask(Predecessor, BB);
8229     if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is too.
8230       BlockMaskCache[BB] = EdgeMask;
8231       return;
8232     }
8233 
8234     if (!BlockMask) { // BlockMask has its initialized nullptr value.
8235       BlockMask = EdgeMask;
8236       continue;
8237     }
8238 
8239     BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
8240   }
8241 
8242   BlockMaskCache[BB] = BlockMask;
8243 }
8244 
8245 VPWidenMemoryRecipe *
8246 VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
8247                                   VFRange &Range) {
8248   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
8249          "Must be called with either a load or store");
8250 
8251   auto WillWiden = [&](ElementCount VF) -> bool {
8252     LoopVectorizationCostModel::InstWidening Decision =
8253         CM.getWideningDecision(I, VF);
8254     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
8255            "CM decision should be taken at this point.");
8256     if (Decision == LoopVectorizationCostModel::CM_Interleave)
8257       return true;
8258     if (CM.isScalarAfterVectorization(I, VF) ||
8259         CM.isProfitableToScalarize(I, VF))
8260       return false;
8261     return Decision != LoopVectorizationCostModel::CM_Scalarize;
8262   };
8263 
8264   if (!LoopVectorizationPlanner::getDecisionAndClampRange(WillWiden, Range))
8265     return nullptr;
8266 
8267   VPValue *Mask = nullptr;
8268   if (Legal->isMaskRequired(I))
8269     Mask = getBlockInMask(I->getParent());
8270 
8271   // Determine if the pointer operand of the access is either consecutive or
8272   // reverse consecutive.
8273   LoopVectorizationCostModel::InstWidening Decision =
8274       CM.getWideningDecision(I, Range.Start);
8275   bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
8276   bool Consecutive =
8277       Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
8278 
8279   VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
8280   if (Consecutive) {
8281     auto *GEP = dyn_cast<GetElementPtrInst>(
8282         Ptr->getUnderlyingValue()->stripPointerCasts());
8283     VPSingleDefRecipe *VectorPtr;
8284     if (Reverse) {
8285       // When folding the tail, we may compute an address that we don't in the
8286       // original scalar loop and it may not be inbounds. Drop Inbounds in that
8287       // case.
8288       GEPNoWrapFlags Flags =
8289           (CM.foldTailByMasking() || !GEP || !GEP->isInBounds())
8290               ? GEPNoWrapFlags::none()
8291               : GEPNoWrapFlags::inBounds();
8292       VectorPtr = new VPReverseVectorPointerRecipe(
8293           Ptr, &Plan.getVF(), getLoadStoreType(I), Flags, I->getDebugLoc());
8294     } else {
8295       VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I),
8296                                             GEP ? GEP->getNoWrapFlags()
8297                                                 : GEPNoWrapFlags::none(),
8298                                             I->getDebugLoc());
8299     }
8300     Builder.getInsertBlock()->appendRecipe(VectorPtr);
8301     Ptr = VectorPtr;
8302   }
8303   if (LoadInst *Load = dyn_cast<LoadInst>(I))
8304     return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
8305                                  I->getDebugLoc());
8306 
8307   StoreInst *Store = cast<StoreInst>(I);
8308   return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive,
8309                                 Reverse, I->getDebugLoc());
8310 }
8311 
8312 /// Creates a VPWidenIntOrFpInductionRecpipe for \p Phi. If needed, it will also
8313 /// insert a recipe to expand the step for the induction recipe.
8314 static VPWidenIntOrFpInductionRecipe *
8315 createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc,
8316                             VPValue *Start, const InductionDescriptor &IndDesc,
8317                             VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop) {
8318   assert(IndDesc.getStartValue() ==
8319          Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
8320   assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) &&
8321          "step must be loop invariant");
8322 
8323   VPValue *Step =
8324       vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE);
8325   if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) {
8326     return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(),
8327                                              IndDesc, TruncI,
8328                                              TruncI->getDebugLoc());
8329   }
8330   assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
8331   return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(),
8332                                            IndDesc, Phi->getDebugLoc());
8333 }
8334 
8335 VPHeaderPHIRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI(
8336     PHINode *Phi, ArrayRef<VPValue *> Operands, VFRange &Range) {
8337 
8338   // Check if this is an integer or fp induction. If so, build the recipe that
8339   // produces its scalar and vector values.
8340   if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
8341     return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, Plan,
8342                                        *PSE.getSE(), *OrigLoop);
8343 
8344   // Check if this is pointer induction. If so, build the recipe for it.
8345   if (auto *II = Legal->getPointerInductionDescriptor(Phi)) {
8346     VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep(),
8347                                                            *PSE.getSE());
8348     return new VPWidenPointerInductionRecipe(
8349         Phi, Operands[0], Step, *II,
8350         LoopVectorizationPlanner::getDecisionAndClampRange(
8351             [&](ElementCount VF) {
8352               return CM.isScalarAfterVectorization(Phi, VF);
8353             },
8354             Range),
8355         Phi->getDebugLoc());
8356   }
8357   return nullptr;
8358 }
8359 
8360 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
8361     TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range) {
8362   // Optimize the special case where the source is a constant integer
8363   // induction variable. Notice that we can only optimize the 'trunc' case
8364   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
8365   // (c) other casts depend on pointer size.
8366 
8367   // Determine whether \p K is a truncation based on an induction variable that
8368   // can be optimized.
8369   auto IsOptimizableIVTruncate =
8370       [&](Instruction *K) -> std::function<bool(ElementCount)> {
8371     return [=](ElementCount VF) -> bool {
8372       return CM.isOptimizableIVTruncate(K, VF);
8373     };
8374   };
8375 
8376   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8377           IsOptimizableIVTruncate(I), Range)) {
8378 
8379     auto *Phi = cast<PHINode>(I->getOperand(0));
8380     const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi);
8381     VPValue *Start = Plan.getOrAddLiveIn(II.getStartValue());
8382     return createWidenInductionRecipes(Phi, I, Start, II, Plan, *PSE.getSE(),
8383                                        *OrigLoop);
8384   }
8385   return nullptr;
8386 }
8387 
8388 VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi,
8389                                            ArrayRef<VPValue *> Operands) {
8390   unsigned NumIncoming = Phi->getNumIncomingValues();
8391 
8392   // We know that all PHIs in non-header blocks are converted into selects, so
8393   // we don't have to worry about the insertion order and we can just use the
8394   // builder. At this point we generate the predication tree. There may be
8395   // duplications since this is a simple recursive scan, but future
8396   // optimizations will clean it up.
8397   SmallVector<VPValue *, 2> OperandsWithMask;
8398 
8399   for (unsigned In = 0; In < NumIncoming; In++) {
8400     OperandsWithMask.push_back(Operands[In]);
8401     VPValue *EdgeMask =
8402         getEdgeMask(Phi->getIncomingBlock(In), Phi->getParent());
8403     if (!EdgeMask) {
8404       assert(In == 0 && "Both null and non-null edge masks found");
8405       assert(all_equal(Operands) &&
8406              "Distinct incoming values with one having a full mask");
8407       break;
8408     }
8409     OperandsWithMask.push_back(EdgeMask);
8410   }
8411   return new VPBlendRecipe(Phi, OperandsWithMask);
8412 }
8413 
8414 VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
8415                                                    ArrayRef<VPValue *> Operands,
8416                                                    VFRange &Range) {
8417   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8418       [this, CI](ElementCount VF) {
8419         return CM.isScalarWithPredication(CI, VF);
8420       },
8421       Range);
8422 
8423   if (IsPredicated)
8424     return nullptr;
8425 
8426   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8427   if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8428              ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8429              ID == Intrinsic::pseudoprobe ||
8430              ID == Intrinsic::experimental_noalias_scope_decl))
8431     return nullptr;
8432 
8433   SmallVector<VPValue *, 4> Ops(Operands.take_front(CI->arg_size()));
8434 
8435   // Is it beneficial to perform intrinsic call compared to lib call?
8436   bool ShouldUseVectorIntrinsic =
8437       ID && LoopVectorizationPlanner::getDecisionAndClampRange(
8438                 [&](ElementCount VF) -> bool {
8439                   return CM.getCallWideningDecision(CI, VF).Kind ==
8440                          LoopVectorizationCostModel::CM_IntrinsicCall;
8441                 },
8442                 Range);
8443   if (ShouldUseVectorIntrinsic)
8444     return new VPWidenIntrinsicRecipe(*CI, ID, Ops, CI->getType(),
8445                                       CI->getDebugLoc());
8446 
8447   Function *Variant = nullptr;
8448   std::optional<unsigned> MaskPos;
8449   // Is better to call a vectorized version of the function than to to scalarize
8450   // the call?
8451   auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange(
8452       [&](ElementCount VF) -> bool {
8453         // The following case may be scalarized depending on the VF.
8454         // The flag shows whether we can use a usual Call for vectorized
8455         // version of the instruction.
8456 
8457         // If we've found a variant at a previous VF, then stop looking. A
8458         // vectorized variant of a function expects input in a certain shape
8459         // -- basically the number of input registers, the number of lanes
8460         // per register, and whether there's a mask required.
8461         // We store a pointer to the variant in the VPWidenCallRecipe, so
8462         // once we have an appropriate variant it's only valid for that VF.
8463         // This will force a different vplan to be generated for each VF that
8464         // finds a valid variant.
8465         if (Variant)
8466           return false;
8467         LoopVectorizationCostModel::CallWideningDecision Decision =
8468             CM.getCallWideningDecision(CI, VF);
8469         if (Decision.Kind == LoopVectorizationCostModel::CM_VectorCall) {
8470           Variant = Decision.Variant;
8471           MaskPos = Decision.MaskPos;
8472           return true;
8473         }
8474 
8475         return false;
8476       },
8477       Range);
8478   if (ShouldUseVectorCall) {
8479     if (MaskPos.has_value()) {
8480       // We have 2 cases that would require a mask:
8481       //   1) The block needs to be predicated, either due to a conditional
8482       //      in the scalar loop or use of an active lane mask with
8483       //      tail-folding, and we use the appropriate mask for the block.
8484       //   2) No mask is required for the block, but the only available
8485       //      vector variant at this VF requires a mask, so we synthesize an
8486       //      all-true mask.
8487       VPValue *Mask = nullptr;
8488       if (Legal->isMaskRequired(CI))
8489         Mask = getBlockInMask(CI->getParent());
8490       else
8491         Mask = Plan.getOrAddLiveIn(
8492             ConstantInt::getTrue(IntegerType::getInt1Ty(CI->getContext())));
8493 
8494       Ops.insert(Ops.begin() + *MaskPos, Mask);
8495     }
8496 
8497     Ops.push_back(Operands.back());
8498     return new VPWidenCallRecipe(CI, Variant, Ops, CI->getDebugLoc());
8499   }
8500 
8501   return nullptr;
8502 }
8503 
8504 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
8505   assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8506          !isa<StoreInst>(I) && "Instruction should have been handled earlier");
8507   // Instruction should be widened, unless it is scalar after vectorization,
8508   // scalarization is profitable or it is predicated.
8509   auto WillScalarize = [this, I](ElementCount VF) -> bool {
8510     return CM.isScalarAfterVectorization(I, VF) ||
8511            CM.isProfitableToScalarize(I, VF) ||
8512            CM.isScalarWithPredication(I, VF);
8513   };
8514   return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
8515                                                              Range);
8516 }
8517 
8518 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
8519                                            ArrayRef<VPValue *> Operands,
8520                                            VPBasicBlock *VPBB) {
8521   switch (I->getOpcode()) {
8522   default:
8523     return nullptr;
8524   case Instruction::SDiv:
8525   case Instruction::UDiv:
8526   case Instruction::SRem:
8527   case Instruction::URem: {
8528     // If not provably safe, use a select to form a safe divisor before widening the
8529     // div/rem operation itself.  Otherwise fall through to general handling below.
8530     if (CM.isPredicatedInst(I)) {
8531       SmallVector<VPValue *> Ops(Operands);
8532       VPValue *Mask = getBlockInMask(I->getParent());
8533       VPValue *One =
8534           Plan.getOrAddLiveIn(ConstantInt::get(I->getType(), 1u, false));
8535       auto *SafeRHS = Builder.createSelect(Mask, Ops[1], One, I->getDebugLoc());
8536       Ops[1] = SafeRHS;
8537       return new VPWidenRecipe(*I, make_range(Ops.begin(), Ops.end()));
8538     }
8539     [[fallthrough]];
8540   }
8541   case Instruction::Add:
8542   case Instruction::And:
8543   case Instruction::AShr:
8544   case Instruction::FAdd:
8545   case Instruction::FCmp:
8546   case Instruction::FDiv:
8547   case Instruction::FMul:
8548   case Instruction::FNeg:
8549   case Instruction::FRem:
8550   case Instruction::FSub:
8551   case Instruction::ICmp:
8552   case Instruction::LShr:
8553   case Instruction::Mul:
8554   case Instruction::Or:
8555   case Instruction::Select:
8556   case Instruction::Shl:
8557   case Instruction::Sub:
8558   case Instruction::Xor:
8559   case Instruction::Freeze:
8560     SmallVector<VPValue *> NewOps(Operands);
8561     if (Instruction::isBinaryOp(I->getOpcode())) {
8562       // The legacy cost model uses SCEV to check if some of the operands are
8563       // constants. To match the legacy cost model's behavior, use SCEV to try
8564       // to replace operands with constants.
8565       ScalarEvolution &SE = *PSE.getSE();
8566       auto GetConstantViaSCEV = [this, &SE](VPValue *Op) {
8567         Value *V = Op->getUnderlyingValue();
8568         if (isa<Constant>(V) || !SE.isSCEVable(V->getType()))
8569           return Op;
8570         auto *C = dyn_cast<SCEVConstant>(SE.getSCEV(V));
8571         if (!C)
8572           return Op;
8573         return Plan.getOrAddLiveIn(C->getValue());
8574       };
8575       // For Mul, the legacy cost model checks both operands.
8576       if (I->getOpcode() == Instruction::Mul)
8577         NewOps[0] = GetConstantViaSCEV(NewOps[0]);
8578       // For other binops, the legacy cost model only checks the second operand.
8579       NewOps[1] = GetConstantViaSCEV(NewOps[1]);
8580     }
8581     return new VPWidenRecipe(*I, make_range(NewOps.begin(), NewOps.end()));
8582   };
8583 }
8584 
8585 VPHistogramRecipe *
8586 VPRecipeBuilder::tryToWidenHistogram(const HistogramInfo *HI,
8587                                      ArrayRef<VPValue *> Operands) {
8588   // FIXME: Support other operations.
8589   unsigned Opcode = HI->Update->getOpcode();
8590   assert((Opcode == Instruction::Add || Opcode == Instruction::Sub) &&
8591          "Histogram update operation must be an Add or Sub");
8592 
8593   SmallVector<VPValue *, 3> HGramOps;
8594   // Bucket address.
8595   HGramOps.push_back(Operands[1]);
8596   // Increment value.
8597   HGramOps.push_back(getVPValueOrAddLiveIn(HI->Update->getOperand(1)));
8598 
8599   // In case of predicated execution (due to tail-folding, or conditional
8600   // execution, or both), pass the relevant mask.
8601   if (Legal->isMaskRequired(HI->Store))
8602     HGramOps.push_back(getBlockInMask(HI->Store->getParent()));
8603 
8604   return new VPHistogramRecipe(Opcode,
8605                                make_range(HGramOps.begin(), HGramOps.end()),
8606                                HI->Store->getDebugLoc());
8607 }
8608 
8609 void VPRecipeBuilder::fixHeaderPhis() {
8610   BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
8611   for (VPHeaderPHIRecipe *R : PhisToFix) {
8612     auto *PN = cast<PHINode>(R->getUnderlyingValue());
8613     VPRecipeBase *IncR =
8614         getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
8615     R->addOperand(IncR->getVPSingleValue());
8616   }
8617 }
8618 
8619 VPReplicateRecipe *VPRecipeBuilder::handleReplication(Instruction *I,
8620                                                       VFRange &Range) {
8621   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
8622       [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8623       Range);
8624 
8625   bool IsPredicated = CM.isPredicatedInst(I);
8626 
8627   // Even if the instruction is not marked as uniform, there are certain
8628   // intrinsic calls that can be effectively treated as such, so we check for
8629   // them here. Conservatively, we only do this for scalable vectors, since
8630   // for fixed-width VFs we can always fall back on full scalarization.
8631   if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
8632     switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
8633     case Intrinsic::assume:
8634     case Intrinsic::lifetime_start:
8635     case Intrinsic::lifetime_end:
8636       // For scalable vectors if one of the operands is variant then we still
8637       // want to mark as uniform, which will generate one instruction for just
8638       // the first lane of the vector. We can't scalarize the call in the same
8639       // way as for fixed-width vectors because we don't know how many lanes
8640       // there are.
8641       //
8642       // The reasons for doing it this way for scalable vectors are:
8643       //   1. For the assume intrinsic generating the instruction for the first
8644       //      lane is still be better than not generating any at all. For
8645       //      example, the input may be a splat across all lanes.
8646       //   2. For the lifetime start/end intrinsics the pointer operand only
8647       //      does anything useful when the input comes from a stack object,
8648       //      which suggests it should always be uniform. For non-stack objects
8649       //      the effect is to poison the object, which still allows us to
8650       //      remove the call.
8651       IsUniform = true;
8652       break;
8653     default:
8654       break;
8655     }
8656   }
8657   VPValue *BlockInMask = nullptr;
8658   if (!IsPredicated) {
8659     // Finalize the recipe for Instr, first if it is not predicated.
8660     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8661   } else {
8662     LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8663     // Instructions marked for predication are replicated and a mask operand is
8664     // added initially. Masked replicate recipes will later be placed under an
8665     // if-then construct to prevent side-effects. Generate recipes to compute
8666     // the block mask for this region.
8667     BlockInMask = getBlockInMask(I->getParent());
8668   }
8669 
8670   // Note that there is some custom logic to mark some intrinsics as uniform
8671   // manually above for scalable vectors, which this assert needs to account for
8672   // as well.
8673   assert((Range.Start.isScalar() || !IsUniform || !IsPredicated ||
8674           (Range.Start.isScalable() && isa<IntrinsicInst>(I))) &&
8675          "Should not predicate a uniform recipe");
8676   auto *Recipe = new VPReplicateRecipe(I, mapToVPValues(I->operands()),
8677                                        IsUniform, BlockInMask);
8678   return Recipe;
8679 }
8680 
8681 /// Find all possible partial reductions in the loop and track all of those that
8682 /// are valid so recipes can be formed later.
8683 void VPRecipeBuilder::collectScaledReductions(VFRange &Range) {
8684   // Find all possible partial reductions.
8685   SmallVector<std::pair<PartialReductionChain, unsigned>, 1>
8686       PartialReductionChains;
8687   for (const auto &[Phi, RdxDesc] : Legal->getReductionVars())
8688     if (std::optional<std::pair<PartialReductionChain, unsigned>> Pair =
8689             getScaledReduction(Phi, RdxDesc, Range))
8690       PartialReductionChains.push_back(*Pair);
8691 
8692   // A partial reduction is invalid if any of its extends are used by
8693   // something that isn't another partial reduction. This is because the
8694   // extends are intended to be lowered along with the reduction itself.
8695 
8696   // Build up a set of partial reduction bin ops for efficient use checking.
8697   SmallSet<User *, 4> PartialReductionBinOps;
8698   for (const auto &[PartialRdx, _] : PartialReductionChains)
8699     PartialReductionBinOps.insert(PartialRdx.BinOp);
8700 
8701   auto ExtendIsOnlyUsedByPartialReductions =
8702       [&PartialReductionBinOps](Instruction *Extend) {
8703         return all_of(Extend->users(), [&](const User *U) {
8704           return PartialReductionBinOps.contains(U);
8705         });
8706       };
8707 
8708   // Check if each use of a chain's two extends is a partial reduction
8709   // and only add those that don't have non-partial reduction users.
8710   for (auto Pair : PartialReductionChains) {
8711     PartialReductionChain Chain = Pair.first;
8712     if (ExtendIsOnlyUsedByPartialReductions(Chain.ExtendA) &&
8713         ExtendIsOnlyUsedByPartialReductions(Chain.ExtendB))
8714       ScaledReductionMap.insert(std::make_pair(Chain.Reduction, Pair.second));
8715   }
8716 }
8717 
8718 std::optional<std::pair<PartialReductionChain, unsigned>>
8719 VPRecipeBuilder::getScaledReduction(PHINode *PHI,
8720                                     const RecurrenceDescriptor &Rdx,
8721                                     VFRange &Range) {
8722   // TODO: Allow scaling reductions when predicating. The select at
8723   // the end of the loop chooses between the phi value and most recent
8724   // reduction result, both of which have different VFs to the active lane
8725   // mask when scaling.
8726   if (CM.blockNeedsPredicationForAnyReason(Rdx.getLoopExitInstr()->getParent()))
8727     return std::nullopt;
8728 
8729   auto *Update = dyn_cast<BinaryOperator>(Rdx.getLoopExitInstr());
8730   if (!Update)
8731     return std::nullopt;
8732 
8733   Value *Op = Update->getOperand(0);
8734   Value *PhiOp = Update->getOperand(1);
8735   if (Op == PHI) {
8736     Op = Update->getOperand(1);
8737     PhiOp = Update->getOperand(0);
8738   }
8739   if (PhiOp != PHI)
8740     return std::nullopt;
8741 
8742   auto *BinOp = dyn_cast<BinaryOperator>(Op);
8743   if (!BinOp || !BinOp->hasOneUse())
8744     return std::nullopt;
8745 
8746   using namespace llvm::PatternMatch;
8747   Value *A, *B;
8748   if (!match(BinOp->getOperand(0), m_ZExtOrSExt(m_Value(A))) ||
8749       !match(BinOp->getOperand(1), m_ZExtOrSExt(m_Value(B))))
8750     return std::nullopt;
8751 
8752   Instruction *ExtA = cast<Instruction>(BinOp->getOperand(0));
8753   Instruction *ExtB = cast<Instruction>(BinOp->getOperand(1));
8754 
8755   TTI::PartialReductionExtendKind OpAExtend =
8756       TargetTransformInfo::getPartialReductionExtendKind(ExtA);
8757   TTI::PartialReductionExtendKind OpBExtend =
8758       TargetTransformInfo::getPartialReductionExtendKind(ExtB);
8759 
8760   PartialReductionChain Chain(Rdx.getLoopExitInstr(), ExtA, ExtB, BinOp);
8761 
8762   unsigned TargetScaleFactor =
8763       PHI->getType()->getPrimitiveSizeInBits().getKnownScalarFactor(
8764           A->getType()->getPrimitiveSizeInBits());
8765 
8766   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8767           [&](ElementCount VF) {
8768             InstructionCost Cost = TTI->getPartialReductionCost(
8769                 Update->getOpcode(), A->getType(), B->getType(), PHI->getType(),
8770                 VF, OpAExtend, OpBExtend,
8771                 std::make_optional(BinOp->getOpcode()));
8772             return Cost.isValid();
8773           },
8774           Range))
8775     return std::make_pair(Chain, TargetScaleFactor);
8776 
8777   return std::nullopt;
8778 }
8779 
8780 VPRecipeBase *
8781 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8782                                         ArrayRef<VPValue *> Operands,
8783                                         VFRange &Range, VPBasicBlock *VPBB) {
8784   // First, check for specific widening recipes that deal with inductions, Phi
8785   // nodes, calls and memory operations.
8786   VPRecipeBase *Recipe;
8787   if (auto *Phi = dyn_cast<PHINode>(Instr)) {
8788     if (Phi->getParent() != OrigLoop->getHeader())
8789       return tryToBlend(Phi, Operands);
8790 
8791     if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range)))
8792       return Recipe;
8793 
8794     VPHeaderPHIRecipe *PhiRecipe = nullptr;
8795     assert((Legal->isReductionVariable(Phi) ||
8796             Legal->isFixedOrderRecurrence(Phi)) &&
8797            "can only widen reductions and fixed-order recurrences here");
8798     VPValue *StartV = Operands[0];
8799     if (Legal->isReductionVariable(Phi)) {
8800       const RecurrenceDescriptor &RdxDesc =
8801           Legal->getReductionVars().find(Phi)->second;
8802       assert(RdxDesc.getRecurrenceStartValue() ==
8803              Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8804 
8805       // If the PHI is used by a partial reduction, set the scale factor.
8806       unsigned ScaleFactor =
8807           getScalingForReduction(RdxDesc.getLoopExitInstr()).value_or(1);
8808       PhiRecipe = new VPReductionPHIRecipe(
8809           Phi, RdxDesc, *StartV, CM.isInLoopReduction(Phi),
8810           CM.useOrderedReductions(RdxDesc), ScaleFactor);
8811     } else {
8812       // TODO: Currently fixed-order recurrences are modeled as chains of
8813       // first-order recurrences. If there are no users of the intermediate
8814       // recurrences in the chain, the fixed order recurrence should be modeled
8815       // directly, enabling more efficient codegen.
8816       PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
8817     }
8818 
8819     PhisToFix.push_back(PhiRecipe);
8820     return PhiRecipe;
8821   }
8822 
8823   if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate(
8824                                     cast<TruncInst>(Instr), Operands, Range)))
8825     return Recipe;
8826 
8827   // All widen recipes below deal only with VF > 1.
8828   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8829           [&](ElementCount VF) { return VF.isScalar(); }, Range))
8830     return nullptr;
8831 
8832   if (auto *CI = dyn_cast<CallInst>(Instr))
8833     return tryToWidenCall(CI, Operands, Range);
8834 
8835   if (StoreInst *SI = dyn_cast<StoreInst>(Instr))
8836     if (auto HistInfo = Legal->getHistogramInfo(SI))
8837       return tryToWidenHistogram(*HistInfo, Operands);
8838 
8839   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8840     return tryToWidenMemory(Instr, Operands, Range);
8841 
8842   if (getScalingForReduction(Instr))
8843     return tryToCreatePartialReduction(Instr, Operands);
8844 
8845   if (!shouldWiden(Instr, Range))
8846     return nullptr;
8847 
8848   if (auto *GEP = dyn_cast<GetElementPtrInst>(Instr))
8849     return new VPWidenGEPRecipe(GEP,
8850                                 make_range(Operands.begin(), Operands.end()));
8851 
8852   if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8853     return new VPWidenSelectRecipe(
8854         *SI, make_range(Operands.begin(), Operands.end()));
8855   }
8856 
8857   if (auto *CI = dyn_cast<CastInst>(Instr)) {
8858     return new VPWidenCastRecipe(CI->getOpcode(), Operands[0], CI->getType(),
8859                                  *CI);
8860   }
8861 
8862   return tryToWiden(Instr, Operands, VPBB);
8863 }
8864 
8865 VPRecipeBase *
8866 VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
8867                                              ArrayRef<VPValue *> Operands) {
8868   assert(Operands.size() == 2 &&
8869          "Unexpected number of operands for partial reduction");
8870 
8871   VPValue *BinOp = Operands[0];
8872   VPValue *Phi = Operands[1];
8873   if (isa<VPReductionPHIRecipe>(BinOp->getDefiningRecipe()))
8874     std::swap(BinOp, Phi);
8875 
8876   return new VPPartialReductionRecipe(Reduction->getOpcode(), BinOp, Phi,
8877                                       Reduction);
8878 }
8879 
8880 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8881                                                         ElementCount MaxVF) {
8882   assert(OrigLoop->isInnermost() && "Inner loop expected.");
8883 
8884   auto MaxVFTimes2 = MaxVF * 2;
8885   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
8886     VFRange SubRange = {VF, MaxVFTimes2};
8887     if (auto Plan = tryToBuildVPlanWithVPRecipes(SubRange)) {
8888       // Now optimize the initial VPlan.
8889       if (!Plan->hasVF(ElementCount::getFixed(1)))
8890         VPlanTransforms::truncateToMinimalBitwidths(*Plan,
8891                                                     CM.getMinimalBitwidths());
8892       VPlanTransforms::optimize(*Plan);
8893       // TODO: try to put it close to addActiveLaneMask().
8894       // Discard the plan if it is not EVL-compatible
8895       if (CM.foldTailWithEVL() && !VPlanTransforms::tryAddExplicitVectorLength(
8896                                       *Plan, CM.getMaxSafeElements()))
8897         break;
8898       assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
8899       VPlans.push_back(std::move(Plan));
8900     }
8901     VF = SubRange.End;
8902   }
8903 }
8904 
8905 // Add the necessary canonical IV and branch recipes required to control the
8906 // loop.
8907 static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
8908                                   DebugLoc DL) {
8909   Value *StartIdx = ConstantInt::get(IdxTy, 0);
8910   auto *StartV = Plan.getOrAddLiveIn(StartIdx);
8911 
8912   // Add a VPCanonicalIVPHIRecipe starting at 0 to the header.
8913   auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
8914   VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
8915   VPBasicBlock *Header = TopRegion->getEntryBasicBlock();
8916   Header->insert(CanonicalIVPHI, Header->begin());
8917 
8918   VPBuilder Builder(TopRegion->getExitingBasicBlock());
8919   // Add a VPInstruction to increment the scalar canonical IV by VF * UF.
8920   auto *CanonicalIVIncrement = Builder.createOverflowingOp(
8921       Instruction::Add, {CanonicalIVPHI, &Plan.getVFxUF()}, {HasNUW, false}, DL,
8922       "index.next");
8923   CanonicalIVPHI->addOperand(CanonicalIVIncrement);
8924 
8925   // Add the BranchOnCount VPInstruction to the latch.
8926   Builder.createNaryOp(VPInstruction::BranchOnCount,
8927                        {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
8928 }
8929 
8930 /// Create and return a ResumePhi for \p WideIV, unless it is truncated. If the
8931 /// induction recipe is not canonical, creates a VPDerivedIVRecipe to compute
8932 /// the end value of the induction.
8933 static VPInstruction *addResumePhiRecipeForInduction(
8934     VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder,
8935     VPBuilder &ScalarPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC) {
8936   auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
8937   // Truncated wide inductions resume from the last lane of their vector value
8938   // in the last vector iteration which is handled elsewhere.
8939   if (WideIntOrFp && WideIntOrFp->getTruncInst())
8940     return nullptr;
8941 
8942   VPValue *Start = WideIV->getStartValue();
8943   VPValue *Step = WideIV->getStepValue();
8944   const InductionDescriptor &ID = WideIV->getInductionDescriptor();
8945   VPValue *EndValue = VectorTC;
8946   if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
8947     EndValue = VectorPHBuilder.createDerivedIV(
8948         ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
8949         Start, VectorTC, Step);
8950   }
8951 
8952   // EndValue is derived from the vector trip count (which has the same type as
8953   // the widest induction) and thus may be wider than the induction here.
8954   Type *ScalarTypeOfWideIV = TypeInfo.inferScalarType(WideIV);
8955   if (ScalarTypeOfWideIV != TypeInfo.inferScalarType(EndValue)) {
8956     EndValue = VectorPHBuilder.createScalarCast(Instruction::Trunc, EndValue,
8957                                                 ScalarTypeOfWideIV,
8958                                                 WideIV->getDebugLoc());
8959   }
8960 
8961   auto *ResumePhiRecipe =
8962       ScalarPHBuilder.createNaryOp(VPInstruction::ResumePhi, {EndValue, Start},
8963                                    WideIV->getDebugLoc(), "bc.resume.val");
8964   return ResumePhiRecipe;
8965 }
8966 
8967 /// Create resume phis in the scalar preheader for first-order recurrences,
8968 /// reductions and inductions, and update the VPIRInstructions wrapping the
8969 /// original phis in the scalar header. End values for inductions are added to
8970 /// \p IVEndValues.
8971 static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan,
8972                                 DenseMap<VPValue *, VPValue *> &IVEndValues) {
8973   VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
8974   auto *ScalarPH = Plan.getScalarPreheader();
8975   auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getSinglePredecessor());
8976   VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
8977   VPBuilder VectorPHBuilder(
8978       cast<VPBasicBlock>(VectorRegion->getSinglePredecessor()));
8979   VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
8980   VPBuilder ScalarPHBuilder(ScalarPH);
8981   VPValue *OneVPV = Plan.getOrAddLiveIn(
8982       ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1));
8983   for (VPRecipeBase &ScalarPhiR : *Plan.getScalarHeader()) {
8984     auto *ScalarPhiIRI = cast<VPIRInstruction>(&ScalarPhiR);
8985     auto *ScalarPhiI = dyn_cast<PHINode>(&ScalarPhiIRI->getInstruction());
8986     if (!ScalarPhiI)
8987       break;
8988 
8989     // TODO: Extract final value from induction recipe initially, optimize to
8990     // pre-computed end value together in optimizeInductionExitUsers.
8991     auto *VectorPhiR = cast<VPHeaderPHIRecipe>(Builder.getRecipe(ScalarPhiI));
8992     if (auto *WideIVR = dyn_cast<VPWidenInductionRecipe>(VectorPhiR)) {
8993       if (VPInstruction *ResumePhi = addResumePhiRecipeForInduction(
8994               WideIVR, VectorPHBuilder, ScalarPHBuilder, TypeInfo,
8995               &Plan.getVectorTripCount())) {
8996         assert(ResumePhi->getOpcode() == VPInstruction::ResumePhi &&
8997                "Expected a ResumePhi");
8998         IVEndValues[WideIVR] = ResumePhi->getOperand(0);
8999         ScalarPhiIRI->addOperand(ResumePhi);
9000         continue;
9001       }
9002       // TODO: Also handle truncated inductions here. Computing end-values
9003       // separately should be done as VPlan-to-VPlan optimization, after
9004       // legalizing all resume values to use the last lane from the loop.
9005       assert(cast<VPWidenIntOrFpInductionRecipe>(VectorPhiR)->getTruncInst() &&
9006              "should only skip truncated wide inductions");
9007       continue;
9008     }
9009 
9010     // The backedge value provides the value to resume coming out of a loop,
9011     // which for FORs is a vector whose last element needs to be extracted. The
9012     // start value provides the value if the loop is bypassed.
9013     bool IsFOR = isa<VPFirstOrderRecurrencePHIRecipe>(VectorPhiR);
9014     auto *ResumeFromVectorLoop = VectorPhiR->getBackedgeValue();
9015     assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
9016            "Cannot handle loops with uncountable early exits");
9017     if (IsFOR)
9018       ResumeFromVectorLoop = MiddleBuilder.createNaryOp(
9019           VPInstruction::ExtractFromEnd, {ResumeFromVectorLoop, OneVPV}, {},
9020           "vector.recur.extract");
9021     StringRef Name = IsFOR ? "scalar.recur.init" : "bc.merge.rdx";
9022     auto *ResumePhiR = ScalarPHBuilder.createNaryOp(
9023         VPInstruction::ResumePhi,
9024         {ResumeFromVectorLoop, VectorPhiR->getStartValue()}, {}, Name);
9025     ScalarPhiIRI->addOperand(ResumePhiR);
9026   }
9027 }
9028 
9029 // Collect VPIRInstructions for phis in the exit blocks that are modeled
9030 // in VPlan and add the exiting VPValue as operand.
9031 static SetVector<VPIRInstruction *>
9032 collectUsersInExitBlocks(Loop *OrigLoop, VPRecipeBuilder &Builder,
9033                          VPlan &Plan) {
9034   auto *MiddleVPBB = Plan.getMiddleBlock();
9035   SetVector<VPIRInstruction *> ExitUsersToFix;
9036   for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
9037     for (VPRecipeBase &R : *ExitVPBB) {
9038       auto *ExitIRI = dyn_cast<VPIRInstruction>(&R);
9039       if (!ExitIRI)
9040         continue;
9041       auto *ExitPhi = dyn_cast<PHINode>(&ExitIRI->getInstruction());
9042       if (!ExitPhi)
9043         break;
9044       for (VPBlockBase *PredVPBB : ExitVPBB->getPredecessors()) {
9045         BasicBlock *ExitingBB = OrigLoop->getLoopLatch();
9046         if (PredVPBB != MiddleVPBB) {
9047           SmallVector<BasicBlock *> ExitingBlocks;
9048           OrigLoop->getExitingBlocks(ExitingBlocks);
9049           assert(ExitingBlocks.size() == 2 && "only support 2 exiting blocks");
9050           ExitingBB = ExitingBB == ExitingBlocks[0] ? ExitingBlocks[1]
9051                                                     : ExitingBlocks[0];
9052         }
9053         Value *IncomingValue = ExitPhi->getIncomingValueForBlock(ExitingBB);
9054         VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue);
9055         ExitUsersToFix.insert(ExitIRI);
9056         ExitIRI->addOperand(V);
9057       }
9058     }
9059   }
9060   return ExitUsersToFix;
9061 }
9062 
9063 // Add exit values to \p Plan. Extracts are added for each entry in \p
9064 // ExitUsersToFix if needed and their operands are updated. Returns true if all
9065 // exit users can be handled, otherwise return false.
9066 static bool
9067 addUsersInExitBlocks(VPlan &Plan,
9068                      const SetVector<VPIRInstruction *> &ExitUsersToFix) {
9069   if (ExitUsersToFix.empty())
9070     return true;
9071 
9072   auto *MiddleVPBB = Plan.getMiddleBlock();
9073   VPBuilder B(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
9074   VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
9075 
9076   // Introduce extract for exiting values and update the VPIRInstructions
9077   // modeling the corresponding LCSSA phis.
9078   for (VPIRInstruction *ExitIRI : ExitUsersToFix) {
9079     for (const auto &[Idx, Op] : enumerate(ExitIRI->operands())) {
9080       // Pass live-in values used by exit phis directly through to their users
9081       // in the exit block.
9082       if (Op->isLiveIn())
9083         continue;
9084 
9085       // Currently only live-ins can be used by exit values from blocks not
9086       // exiting via the vector latch through to the middle block.
9087       if (ExitIRI->getParent()->getSinglePredecessor() != MiddleVPBB)
9088         return false;
9089 
9090       LLVMContext &Ctx = ExitIRI->getInstruction().getContext();
9091       VPValue *Ext = B.createNaryOp(VPInstruction::ExtractFromEnd,
9092                                     {Op, Plan.getOrAddLiveIn(ConstantInt::get(
9093                                              IntegerType::get(Ctx, 32), 1))});
9094       ExitIRI->setOperand(Idx, Ext);
9095     }
9096   }
9097   return true;
9098 }
9099 
9100 /// Handle users in the exit block for first order reductions in the original
9101 /// exit block. The penultimate value of recurrences is fed to their LCSSA phi
9102 /// users in the original exit block using the VPIRInstruction wrapping to the
9103 /// LCSSA phi.
9104 static void addExitUsersForFirstOrderRecurrences(
9105     VPlan &Plan, SetVector<VPIRInstruction *> &ExitUsersToFix) {
9106   VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
9107   auto *ScalarPHVPBB = Plan.getScalarPreheader();
9108   auto *MiddleVPBB = Plan.getMiddleBlock();
9109   VPBuilder ScalarPHBuilder(ScalarPHVPBB);
9110   VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
9111   VPValue *TwoVPV = Plan.getOrAddLiveIn(
9112       ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 2));
9113 
9114   for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
9115     auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
9116     if (!FOR)
9117       continue;
9118 
9119     assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
9120            "Cannot handle loops with uncountable early exits");
9121 
9122     // This is the second phase of vectorizing first-order recurrences, creating
9123     // extract for users outside the loop. An overview of the transformation is
9124     // described below. Suppose we have the following loop with some use after
9125     // the loop of the last a[i-1],
9126     //
9127     //   for (int i = 0; i < n; ++i) {
9128     //     t = a[i - 1];
9129     //     b[i] = a[i] - t;
9130     //   }
9131     //   use t;
9132     //
9133     // There is a first-order recurrence on "a". For this loop, the shorthand
9134     // scalar IR looks like:
9135     //
9136     //   scalar.ph:
9137     //     s.init = a[-1]
9138     //     br scalar.body
9139     //
9140     //   scalar.body:
9141     //     i = phi [0, scalar.ph], [i+1, scalar.body]
9142     //     s1 = phi [s.init, scalar.ph], [s2, scalar.body]
9143     //     s2 = a[i]
9144     //     b[i] = s2 - s1
9145     //     br cond, scalar.body, exit.block
9146     //
9147     //   exit.block:
9148     //     use = lcssa.phi [s1, scalar.body]
9149     //
9150     // In this example, s1 is a recurrence because it's value depends on the
9151     // previous iteration. In the first phase of vectorization, we created a
9152     // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts
9153     // for users in the scalar preheader and exit block.
9154     //
9155     //   vector.ph:
9156     //     v_init = vector(..., ..., ..., a[-1])
9157     //     br vector.body
9158     //
9159     //   vector.body
9160     //     i = phi [0, vector.ph], [i+4, vector.body]
9161     //     v1 = phi [v_init, vector.ph], [v2, vector.body]
9162     //     v2 = a[i, i+1, i+2, i+3]
9163     //     b[i] = v2 - v1
9164     //     // Next, third phase will introduce v1' = splice(v1(3), v2(0, 1, 2))
9165     //     b[i, i+1, i+2, i+3] = v2 - v1
9166     //     br cond, vector.body, middle.block
9167     //
9168     //   middle.block:
9169     //     vector.recur.extract.for.phi = v2(2)
9170     //     vector.recur.extract = v2(3)
9171     //     br cond, scalar.ph, exit.block
9172     //
9173     //   scalar.ph:
9174     //     scalar.recur.init = phi [vector.recur.extract, middle.block],
9175     //                             [s.init, otherwise]
9176     //     br scalar.body
9177     //
9178     //   scalar.body:
9179     //     i = phi [0, scalar.ph], [i+1, scalar.body]
9180     //     s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body]
9181     //     s2 = a[i]
9182     //     b[i] = s2 - s1
9183     //     br cond, scalar.body, exit.block
9184     //
9185     //   exit.block:
9186     //     lo = lcssa.phi [s1, scalar.body],
9187     //                    [vector.recur.extract.for.phi, middle.block]
9188     //
9189     // Now update VPIRInstructions modeling LCSSA phis in the exit block.
9190     // Extract the penultimate value of the recurrence and use it as operand for
9191     // the VPIRInstruction modeling the phi.
9192     for (VPIRInstruction *ExitIRI : ExitUsersToFix) {
9193       if (ExitIRI->getOperand(0) != FOR)
9194         continue;
9195       VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
9196           VPInstruction::ExtractFromEnd, {FOR->getBackedgeValue(), TwoVPV}, {},
9197           "vector.recur.extract.for.phi");
9198       ExitIRI->setOperand(0, PenultimateElement);
9199       ExitUsersToFix.remove(ExitIRI);
9200     }
9201   }
9202 }
9203 
9204 VPlanPtr
9205 LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
9206 
9207   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
9208 
9209   // ---------------------------------------------------------------------------
9210   // Build initial VPlan: Scan the body of the loop in a topological order to
9211   // visit each basic block after having visited its predecessor basic blocks.
9212   // ---------------------------------------------------------------------------
9213 
9214   // Create initial VPlan skeleton, having a basic block for the pre-header
9215   // which contains SCEV expansions that need to happen before the CFG is
9216   // modified; a basic block for the vector pre-header, followed by a region for
9217   // the vector loop, followed by the middle basic block. The skeleton vector
9218   // loop region contains a header and latch basic blocks.
9219 
9220   bool RequiresScalarEpilogueCheck =
9221       LoopVectorizationPlanner::getDecisionAndClampRange(
9222           [this](ElementCount VF) {
9223             return !CM.requiresScalarEpilogue(VF.isVector());
9224           },
9225           Range);
9226   VPlanPtr Plan = VPlan::createInitialVPlan(Legal->getWidestInductionType(),
9227                                             PSE, RequiresScalarEpilogueCheck,
9228                                             CM.foldTailByMasking(), OrigLoop);
9229 
9230   // Don't use getDecisionAndClampRange here, because we don't know the UF
9231   // so this function is better to be conservative, rather than to split
9232   // it up into different VPlans.
9233   // TODO: Consider using getDecisionAndClampRange here to split up VPlans.
9234   bool IVUpdateMayOverflow = false;
9235   for (ElementCount VF : Range)
9236     IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF);
9237 
9238   DebugLoc DL = getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
9239   TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow);
9240   // Use NUW for the induction increment if we proved that it won't overflow in
9241   // the vector loop or when not folding the tail. In the later case, we know
9242   // that the canonical induction increment will not overflow as the vector trip
9243   // count is >= increment and a multiple of the increment.
9244   bool HasNUW = !IVUpdateMayOverflow || Style == TailFoldingStyle::None;
9245   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL);
9246 
9247   VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE,
9248                                 Builder);
9249 
9250   // ---------------------------------------------------------------------------
9251   // Pre-construction: record ingredients whose recipes we'll need to further
9252   // process after constructing the initial VPlan.
9253   // ---------------------------------------------------------------------------
9254 
9255   // For each interleave group which is relevant for this (possibly trimmed)
9256   // Range, add it to the set of groups to be later applied to the VPlan and add
9257   // placeholders for its members' Recipes which we'll be replacing with a
9258   // single VPInterleaveRecipe.
9259   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
9260     auto ApplyIG = [IG, this](ElementCount VF) -> bool {
9261       bool Result = (VF.isVector() && // Query is illegal for VF == 1
9262                      CM.getWideningDecision(IG->getInsertPos(), VF) ==
9263                          LoopVectorizationCostModel::CM_Interleave);
9264       // For scalable vectors, the only interleave factor currently supported
9265       // is 2 since we require the (de)interleave2 intrinsics instead of
9266       // shufflevectors.
9267       assert((!Result || !VF.isScalable() || IG->getFactor() == 2) &&
9268              "Unsupported interleave factor for scalable vectors");
9269       return Result;
9270     };
9271     if (!getDecisionAndClampRange(ApplyIG, Range))
9272       continue;
9273     InterleaveGroups.insert(IG);
9274   }
9275 
9276   // ---------------------------------------------------------------------------
9277   // Construct recipes for the instructions in the loop
9278   // ---------------------------------------------------------------------------
9279 
9280   // Scan the body of the loop in a topological order to visit each basic block
9281   // after having visited its predecessor basic blocks.
9282   LoopBlocksDFS DFS(OrigLoop);
9283   DFS.perform(LI);
9284 
9285   VPBasicBlock *HeaderVPBB = Plan->getVectorLoopRegion()->getEntryBasicBlock();
9286   VPBasicBlock *VPBB = HeaderVPBB;
9287   BasicBlock *HeaderBB = OrigLoop->getHeader();
9288   bool NeedsMasks =
9289       CM.foldTailByMasking() ||
9290       any_of(OrigLoop->blocks(), [this, HeaderBB](BasicBlock *BB) {
9291         bool NeedsBlends = BB != HeaderBB && !BB->phis().empty();
9292         return Legal->blockNeedsPredication(BB) || NeedsBlends;
9293       });
9294 
9295   RecipeBuilder.collectScaledReductions(Range);
9296 
9297   auto *MiddleVPBB = Plan->getMiddleBlock();
9298   VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi();
9299   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
9300     // Relevant instructions from basic block BB will be grouped into VPRecipe
9301     // ingredients and fill a new VPBasicBlock.
9302     if (VPBB != HeaderVPBB)
9303       VPBB->setName(BB->getName());
9304     Builder.setInsertPoint(VPBB);
9305 
9306     if (VPBB == HeaderVPBB)
9307       RecipeBuilder.createHeaderMask();
9308     else if (NeedsMasks)
9309       RecipeBuilder.createBlockInMask(BB);
9310 
9311     // Introduce each ingredient into VPlan.
9312     // TODO: Model and preserve debug intrinsics in VPlan.
9313     for (Instruction &I : drop_end(BB->instructionsWithoutDebug(false))) {
9314       Instruction *Instr = &I;
9315       SmallVector<VPValue *, 4> Operands;
9316       auto *Phi = dyn_cast<PHINode>(Instr);
9317       if (Phi && Phi->getParent() == HeaderBB) {
9318         Operands.push_back(Plan->getOrAddLiveIn(
9319             Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
9320       } else {
9321         auto OpRange = RecipeBuilder.mapToVPValues(Instr->operands());
9322         Operands = {OpRange.begin(), OpRange.end()};
9323       }
9324 
9325       // The stores with invariant address inside the loop will be deleted, and
9326       // in the exit block, a uniform store recipe will be created for the final
9327       // invariant store of the reduction.
9328       StoreInst *SI;
9329       if ((SI = dyn_cast<StoreInst>(&I)) &&
9330           Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) {
9331         // Only create recipe for the final invariant store of the reduction.
9332         if (!Legal->isInvariantStoreOfReduction(SI))
9333           continue;
9334         auto *Recipe = new VPReplicateRecipe(
9335             SI, RecipeBuilder.mapToVPValues(Instr->operands()),
9336             true /* IsUniform */);
9337         Recipe->insertBefore(*MiddleVPBB, MBIP);
9338         continue;
9339       }
9340 
9341       VPRecipeBase *Recipe =
9342           RecipeBuilder.tryToCreateWidenRecipe(Instr, Operands, Range, VPBB);
9343       if (!Recipe)
9344         Recipe = RecipeBuilder.handleReplication(Instr, Range);
9345 
9346       RecipeBuilder.setRecipe(Instr, Recipe);
9347       if (isa<VPHeaderPHIRecipe>(Recipe)) {
9348         // VPHeaderPHIRecipes must be kept in the phi section of HeaderVPBB. In
9349         // the following cases, VPHeaderPHIRecipes may be created after non-phi
9350         // recipes and need to be moved to the phi section of HeaderVPBB:
9351         // * tail-folding (non-phi recipes computing the header mask are
9352         // introduced earlier than regular header phi recipes, and should appear
9353         // after them)
9354         // * Optimizing truncates to VPWidenIntOrFpInductionRecipe.
9355 
9356         assert((HeaderVPBB->getFirstNonPhi() == VPBB->end() ||
9357                 CM.foldTailByMasking() || isa<TruncInst>(Instr)) &&
9358                "unexpected recipe needs moving");
9359         Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
9360       } else
9361         VPBB->appendRecipe(Recipe);
9362     }
9363 
9364     VPBlockUtils::insertBlockAfter(Plan->createVPBasicBlock(""), VPBB);
9365     VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
9366   }
9367 
9368   // After here, VPBB should not be used.
9369   VPBB = nullptr;
9370 
9371   assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
9372          !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
9373          "entry block must be set to a VPRegionBlock having a non-empty entry "
9374          "VPBasicBlock");
9375   RecipeBuilder.fixHeaderPhis();
9376 
9377   // Update wide induction increments to use the same step as the corresponding
9378   // wide induction. This enables detecting induction increments directly in
9379   // VPlan and removes redundant splats.
9380   for (const auto &[Phi, ID] : Legal->getInductionVars()) {
9381     auto *IVInc = cast<Instruction>(
9382         Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
9383     if (IVInc->getOperand(0) != Phi || IVInc->getOpcode() != Instruction::Add)
9384       continue;
9385     VPWidenInductionRecipe *WideIV =
9386         cast<VPWidenInductionRecipe>(RecipeBuilder.getRecipe(Phi));
9387     VPRecipeBase *R = RecipeBuilder.getRecipe(IVInc);
9388     R->setOperand(1, WideIV->getStepValue());
9389   }
9390 
9391   if (auto *UncountableExitingBlock =
9392           Legal->getUncountableEarlyExitingBlock()) {
9393     VPlanTransforms::handleUncountableEarlyExit(
9394         *Plan, *PSE.getSE(), OrigLoop, UncountableExitingBlock, RecipeBuilder);
9395   }
9396   DenseMap<VPValue *, VPValue *> IVEndValues;
9397   addScalarResumePhis(RecipeBuilder, *Plan, IVEndValues);
9398   SetVector<VPIRInstruction *> ExitUsersToFix =
9399       collectUsersInExitBlocks(OrigLoop, RecipeBuilder, *Plan);
9400   addExitUsersForFirstOrderRecurrences(*Plan, ExitUsersToFix);
9401   if (!addUsersInExitBlocks(*Plan, ExitUsersToFix)) {
9402     reportVectorizationFailure(
9403         "Some exit values in loop with uncountable exit not supported yet",
9404         "UncountableEarlyExitLoopsUnsupportedExitValue", ORE, OrigLoop);
9405     return nullptr;
9406   }
9407 
9408   // ---------------------------------------------------------------------------
9409   // Transform initial VPlan: Apply previously taken decisions, in order, to
9410   // bring the VPlan to its final state.
9411   // ---------------------------------------------------------------------------
9412 
9413   // Adjust the recipes for any inloop reductions.
9414   adjustRecipesForReductions(Plan, RecipeBuilder, Range.Start);
9415 
9416   // Interleave memory: for each Interleave Group we marked earlier as relevant
9417   // for this VPlan, replace the Recipes widening its memory instructions with a
9418   // single VPInterleaveRecipe at its insertion point.
9419   VPlanTransforms::createInterleaveGroups(
9420       *Plan, InterleaveGroups, RecipeBuilder, CM.isScalarEpilogueAllowed());
9421 
9422   for (ElementCount VF : Range)
9423     Plan->addVF(VF);
9424   Plan->setName("Initial VPlan");
9425 
9426   // Replace VPValues for known constant strides guaranteed by predicate scalar
9427   // evolution.
9428   auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) {
9429     auto *R = cast<VPRecipeBase>(&U);
9430     return R->getParent()->getParent() ||
9431            R->getParent() ==
9432                Plan->getVectorLoopRegion()->getSinglePredecessor();
9433   };
9434   for (auto [_, Stride] : Legal->getLAI()->getSymbolicStrides()) {
9435     auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
9436     auto *ScevStride = dyn_cast<SCEVConstant>(PSE.getSCEV(StrideV));
9437     // Only handle constant strides for now.
9438     if (!ScevStride)
9439       continue;
9440 
9441     auto *CI = Plan->getOrAddLiveIn(
9442         ConstantInt::get(Stride->getType(), ScevStride->getAPInt()));
9443     if (VPValue *StrideVPV = Plan->getLiveIn(StrideV))
9444       StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
9445 
9446     // The versioned value may not be used in the loop directly but through a
9447     // sext/zext. Add new live-ins in those cases.
9448     for (Value *U : StrideV->users()) {
9449       if (!isa<SExtInst, ZExtInst>(U))
9450         continue;
9451       VPValue *StrideVPV = Plan->getLiveIn(U);
9452       if (!StrideVPV)
9453         continue;
9454       unsigned BW = U->getType()->getScalarSizeInBits();
9455       APInt C = isa<SExtInst>(U) ? ScevStride->getAPInt().sext(BW)
9456                                  : ScevStride->getAPInt().zext(BW);
9457       VPValue *CI = Plan->getOrAddLiveIn(ConstantInt::get(U->getType(), C));
9458       StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
9459     }
9460   }
9461 
9462   VPlanTransforms::dropPoisonGeneratingRecipes(*Plan, [this](BasicBlock *BB) {
9463     return Legal->blockNeedsPredication(BB);
9464   });
9465 
9466   // Sink users of fixed-order recurrence past the recipe defining the previous
9467   // value and introduce FirstOrderRecurrenceSplice VPInstructions.
9468   if (!VPlanTransforms::adjustFixedOrderRecurrences(*Plan, Builder))
9469     return nullptr;
9470 
9471   if (useActiveLaneMask(Style)) {
9472     // TODO: Move checks to VPlanTransforms::addActiveLaneMask once
9473     // TailFoldingStyle is visible there.
9474     bool ForControlFlow = useActiveLaneMaskForControlFlow(Style);
9475     bool WithoutRuntimeCheck =
9476         Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
9477     VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow,
9478                                        WithoutRuntimeCheck);
9479   }
9480   VPlanTransforms::optimizeInductionExitUsers(*Plan, IVEndValues);
9481 
9482   assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
9483   return Plan;
9484 }
9485 
9486 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
9487   // Outer loop handling: They may require CFG and instruction level
9488   // transformations before even evaluating whether vectorization is profitable.
9489   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
9490   // the vectorization pipeline.
9491   assert(!OrigLoop->isInnermost());
9492   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
9493 
9494   // Create new empty VPlan
9495   auto Plan = VPlan::createInitialVPlan(Legal->getWidestInductionType(), PSE,
9496                                         true, false, OrigLoop);
9497 
9498   // Build hierarchical CFG
9499   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
9500   HCFGBuilder.buildHierarchicalCFG();
9501 
9502   for (ElementCount VF : Range)
9503     Plan->addVF(VF);
9504 
9505   VPlanTransforms::VPInstructionsToVPRecipes(
9506       Plan,
9507       [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
9508       *PSE.getSE(), *TLI);
9509 
9510   // Remove the existing terminator of the exiting block of the top-most region.
9511   // A BranchOnCount will be added instead when adding the canonical IV recipes.
9512   auto *Term =
9513       Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator();
9514   Term->eraseFromParent();
9515 
9516   // Tail folding is not supported for outer loops, so the induction increment
9517   // is guaranteed to not wrap.
9518   bool HasNUW = true;
9519   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW,
9520                         DebugLoc());
9521 
9522   // Collect mapping of IR header phis to header phi recipes, to be used in
9523   // addScalarResumePhis.
9524   VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE,
9525                                 Builder);
9526   for (auto &R : Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
9527     if (isa<VPCanonicalIVPHIRecipe>(&R))
9528       continue;
9529     auto *HeaderR = cast<VPHeaderPHIRecipe>(&R);
9530     RecipeBuilder.setRecipe(HeaderR->getUnderlyingInstr(), HeaderR);
9531   }
9532   DenseMap<VPValue *, VPValue *> IVEndValues;
9533   // TODO: IVEndValues are not used yet in the native path, to optimize exit
9534   // values.
9535   addScalarResumePhis(RecipeBuilder, *Plan, IVEndValues);
9536 
9537   assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
9538   return Plan;
9539 }
9540 
9541 // Adjust the recipes for reductions. For in-loop reductions the chain of
9542 // instructions leading from the loop exit instr to the phi need to be converted
9543 // to reductions, with one operand being vector and the other being the scalar
9544 // reduction chain. For other reductions, a select is introduced between the phi
9545 // and users outside the vector region when folding the tail.
9546 //
9547 // A ComputeReductionResult recipe is added to the middle block, also for
9548 // in-loop reductions which compute their result in-loop, because generating
9549 // the subsequent bc.merge.rdx phi is driven by ComputeReductionResult recipes.
9550 //
9551 // Adjust AnyOf reductions; replace the reduction phi for the selected value
9552 // with a boolean reduction phi node to check if the condition is true in any
9553 // iteration. The final value is selected by the final ComputeReductionResult.
9554 void LoopVectorizationPlanner::adjustRecipesForReductions(
9555     VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) {
9556   using namespace VPlanPatternMatch;
9557   VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion();
9558   VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock();
9559   VPBasicBlock *MiddleVPBB = Plan->getMiddleBlock();
9560   SmallVector<VPRecipeBase *> ToDelete;
9561 
9562   for (VPRecipeBase &R : Header->phis()) {
9563     auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9564     if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered()))
9565       continue;
9566 
9567     const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
9568     RecurKind Kind = RdxDesc.getRecurrenceKind();
9569     assert(
9570         !RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) &&
9571         !RecurrenceDescriptor::isFindLastIVRecurrenceKind(Kind) &&
9572         "AnyOf and FindLast reductions are not allowed for in-loop reductions");
9573 
9574     // Collect the chain of "link" recipes for the reduction starting at PhiR.
9575     SetVector<VPSingleDefRecipe *> Worklist;
9576     Worklist.insert(PhiR);
9577     for (unsigned I = 0; I != Worklist.size(); ++I) {
9578       VPSingleDefRecipe *Cur = Worklist[I];
9579       for (VPUser *U : Cur->users()) {
9580         auto *UserRecipe = cast<VPSingleDefRecipe>(U);
9581         if (!UserRecipe->getParent()->getEnclosingLoopRegion()) {
9582           assert((UserRecipe->getParent() == MiddleVPBB ||
9583                   UserRecipe->getParent() == Plan->getScalarPreheader()) &&
9584                  "U must be either in the loop region, the middle block or the "
9585                  "scalar preheader.");
9586           continue;
9587         }
9588         Worklist.insert(UserRecipe);
9589       }
9590     }
9591 
9592     // Visit operation "Links" along the reduction chain top-down starting from
9593     // the phi until LoopExitValue. We keep track of the previous item
9594     // (PreviousLink) to tell which of the two operands of a Link will remain
9595     // scalar and which will be reduced. For minmax by select(cmp), Link will be
9596     // the select instructions. Blend recipes of in-loop reduction phi's  will
9597     // get folded to their non-phi operand, as the reduction recipe handles the
9598     // condition directly.
9599     VPSingleDefRecipe *PreviousLink = PhiR; // Aka Worklist[0].
9600     for (VPSingleDefRecipe *CurrentLink : Worklist.getArrayRef().drop_front()) {
9601       Instruction *CurrentLinkI = CurrentLink->getUnderlyingInstr();
9602 
9603       // Index of the first operand which holds a non-mask vector operand.
9604       unsigned IndexOfFirstOperand;
9605       // Recognize a call to the llvm.fmuladd intrinsic.
9606       bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
9607       VPValue *VecOp;
9608       VPBasicBlock *LinkVPBB = CurrentLink->getParent();
9609       if (IsFMulAdd) {
9610         assert(
9611             RecurrenceDescriptor::isFMulAddIntrinsic(CurrentLinkI) &&
9612             "Expected instruction to be a call to the llvm.fmuladd intrinsic");
9613         assert(((MinVF.isScalar() && isa<VPReplicateRecipe>(CurrentLink)) ||
9614                 isa<VPWidenIntrinsicRecipe>(CurrentLink)) &&
9615                CurrentLink->getOperand(2) == PreviousLink &&
9616                "expected a call where the previous link is the added operand");
9617 
9618         // If the instruction is a call to the llvm.fmuladd intrinsic then we
9619         // need to create an fmul recipe (multiplying the first two operands of
9620         // the fmuladd together) to use as the vector operand for the fadd
9621         // reduction.
9622         VPInstruction *FMulRecipe = new VPInstruction(
9623             Instruction::FMul,
9624             {CurrentLink->getOperand(0), CurrentLink->getOperand(1)},
9625             CurrentLinkI->getFastMathFlags());
9626         LinkVPBB->insert(FMulRecipe, CurrentLink->getIterator());
9627         VecOp = FMulRecipe;
9628       } else {
9629         auto *Blend = dyn_cast<VPBlendRecipe>(CurrentLink);
9630         if (PhiR->isInLoop() && Blend) {
9631           assert(Blend->getNumIncomingValues() == 2 &&
9632                  "Blend must have 2 incoming values");
9633           if (Blend->getIncomingValue(0) == PhiR)
9634             Blend->replaceAllUsesWith(Blend->getIncomingValue(1));
9635           else {
9636             assert(Blend->getIncomingValue(1) == PhiR &&
9637                    "PhiR must be an operand of the blend");
9638             Blend->replaceAllUsesWith(Blend->getIncomingValue(0));
9639           }
9640           continue;
9641         }
9642 
9643         if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9644           if (isa<VPWidenRecipe>(CurrentLink)) {
9645             assert(isa<CmpInst>(CurrentLinkI) &&
9646                    "need to have the compare of the select");
9647             continue;
9648           }
9649           assert(isa<VPWidenSelectRecipe>(CurrentLink) &&
9650                  "must be a select recipe");
9651           IndexOfFirstOperand = 1;
9652         } else {
9653           assert((MinVF.isScalar() || isa<VPWidenRecipe>(CurrentLink)) &&
9654                  "Expected to replace a VPWidenSC");
9655           IndexOfFirstOperand = 0;
9656         }
9657         // Note that for non-commutable operands (cmp-selects), the semantics of
9658         // the cmp-select are captured in the recurrence kind.
9659         unsigned VecOpId =
9660             CurrentLink->getOperand(IndexOfFirstOperand) == PreviousLink
9661                 ? IndexOfFirstOperand + 1
9662                 : IndexOfFirstOperand;
9663         VecOp = CurrentLink->getOperand(VecOpId);
9664         assert(VecOp != PreviousLink &&
9665                CurrentLink->getOperand(CurrentLink->getNumOperands() - 1 -
9666                                        (VecOpId - IndexOfFirstOperand)) ==
9667                    PreviousLink &&
9668                "PreviousLink must be the operand other than VecOp");
9669       }
9670 
9671       BasicBlock *BB = CurrentLinkI->getParent();
9672       VPValue *CondOp = nullptr;
9673       if (CM.blockNeedsPredicationForAnyReason(BB))
9674         CondOp = RecipeBuilder.getBlockInMask(BB);
9675 
9676       auto *RedRecipe = new VPReductionRecipe(
9677           RdxDesc, CurrentLinkI, PreviousLink, VecOp, CondOp,
9678           CM.useOrderedReductions(RdxDesc), CurrentLinkI->getDebugLoc());
9679       // Append the recipe to the end of the VPBasicBlock because we need to
9680       // ensure that it comes after all of it's inputs, including CondOp.
9681       // Delete CurrentLink as it will be invalid if its operand is replaced
9682       // with a reduction defined at the bottom of the block in the next link.
9683       LinkVPBB->appendRecipe(RedRecipe);
9684       CurrentLink->replaceAllUsesWith(RedRecipe);
9685       ToDelete.push_back(CurrentLink);
9686       PreviousLink = RedRecipe;
9687     }
9688   }
9689   VPBasicBlock *LatchVPBB = VectorLoopRegion->getExitingBasicBlock();
9690   Builder.setInsertPoint(&*LatchVPBB->begin());
9691   VPBasicBlock::iterator IP = MiddleVPBB->getFirstNonPhi();
9692   for (VPRecipeBase &R :
9693        Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
9694     VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9695     if (!PhiR)
9696       continue;
9697 
9698     const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
9699     // If tail is folded by masking, introduce selects between the phi
9700     // and the users outside the vector region of each reduction, at the
9701     // beginning of the dedicated latch block.
9702     auto *OrigExitingVPV = PhiR->getBackedgeValue();
9703     auto *NewExitingVPV = PhiR->getBackedgeValue();
9704     if (!PhiR->isInLoop() && CM.foldTailByMasking()) {
9705       VPValue *Cond = RecipeBuilder.getBlockInMask(OrigLoop->getHeader());
9706       assert(OrigExitingVPV->getDefiningRecipe()->getParent() != LatchVPBB &&
9707              "reduction recipe must be defined before latch");
9708       Type *PhiTy = PhiR->getOperand(0)->getLiveInIRValue()->getType();
9709       std::optional<FastMathFlags> FMFs =
9710           PhiTy->isFloatingPointTy()
9711               ? std::make_optional(RdxDesc.getFastMathFlags())
9712               : std::nullopt;
9713       NewExitingVPV =
9714           Builder.createSelect(Cond, OrigExitingVPV, PhiR, {}, "", FMFs);
9715       OrigExitingVPV->replaceUsesWithIf(NewExitingVPV, [](VPUser &U, unsigned) {
9716         return isa<VPInstruction>(&U) &&
9717                cast<VPInstruction>(&U)->getOpcode() ==
9718                    VPInstruction::ComputeReductionResult;
9719       });
9720       if (CM.usePredicatedReductionSelect(
9721               PhiR->getRecurrenceDescriptor().getOpcode(), PhiTy))
9722         PhiR->setOperand(1, NewExitingVPV);
9723     }
9724 
9725     // If the vector reduction can be performed in a smaller type, we truncate
9726     // then extend the loop exit value to enable InstCombine to evaluate the
9727     // entire expression in the smaller type.
9728     Type *PhiTy = PhiR->getStartValue()->getLiveInIRValue()->getType();
9729     if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType() &&
9730         !RecurrenceDescriptor::isAnyOfRecurrenceKind(
9731             RdxDesc.getRecurrenceKind())) {
9732       assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
9733       Type *RdxTy = RdxDesc.getRecurrenceType();
9734       auto *Trunc =
9735           new VPWidenCastRecipe(Instruction::Trunc, NewExitingVPV, RdxTy);
9736       auto *Extnd =
9737           RdxDesc.isSigned()
9738               ? new VPWidenCastRecipe(Instruction::SExt, Trunc, PhiTy)
9739               : new VPWidenCastRecipe(Instruction::ZExt, Trunc, PhiTy);
9740 
9741       Trunc->insertAfter(NewExitingVPV->getDefiningRecipe());
9742       Extnd->insertAfter(Trunc);
9743       if (PhiR->getOperand(1) == NewExitingVPV)
9744         PhiR->setOperand(1, Extnd->getVPSingleValue());
9745       NewExitingVPV = Extnd;
9746     }
9747 
9748     // We want code in the middle block to appear to execute on the location of
9749     // the scalar loop's latch terminator because: (a) it is all compiler
9750     // generated, (b) these instructions are always executed after evaluating
9751     // the latch conditional branch, and (c) other passes may add new
9752     // predecessors which terminate on this line. This is the easiest way to
9753     // ensure we don't accidentally cause an extra step back into the loop while
9754     // debugging.
9755     DebugLoc ExitDL = OrigLoop->getLoopLatch()->getTerminator()->getDebugLoc();
9756 
9757     // TODO: At the moment ComputeReductionResult also drives creation of the
9758     // bc.merge.rdx phi nodes, hence it needs to be created unconditionally here
9759     // even for in-loop reductions, until the reduction resume value handling is
9760     // also modeled in VPlan.
9761     auto *FinalReductionResult = new VPInstruction(
9762         VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL);
9763     // Update all users outside the vector region.
9764     OrigExitingVPV->replaceUsesWithIf(
9765         FinalReductionResult, [](VPUser &User, unsigned) {
9766           auto *Parent = cast<VPRecipeBase>(&User)->getParent();
9767           return Parent && !Parent->getParent();
9768         });
9769     FinalReductionResult->insertBefore(*MiddleVPBB, IP);
9770 
9771     // Adjust AnyOf reductions; replace the reduction phi for the selected value
9772     // with a boolean reduction phi node to check if the condition is true in
9773     // any iteration. The final value is selected by the final
9774     // ComputeReductionResult.
9775     if (RecurrenceDescriptor::isAnyOfRecurrenceKind(
9776             RdxDesc.getRecurrenceKind())) {
9777       auto *Select = cast<VPRecipeBase>(*find_if(PhiR->users(), [](VPUser *U) {
9778         return isa<VPWidenSelectRecipe>(U) ||
9779                (isa<VPReplicateRecipe>(U) &&
9780                 cast<VPReplicateRecipe>(U)->getUnderlyingInstr()->getOpcode() ==
9781                     Instruction::Select);
9782       }));
9783       VPValue *Cmp = Select->getOperand(0);
9784       // If the compare is checking the reduction PHI node, adjust it to check
9785       // the start value.
9786       if (VPRecipeBase *CmpR = Cmp->getDefiningRecipe()) {
9787         for (unsigned I = 0; I != CmpR->getNumOperands(); ++I)
9788           if (CmpR->getOperand(I) == PhiR)
9789             CmpR->setOperand(I, PhiR->getStartValue());
9790       }
9791       VPBuilder::InsertPointGuard Guard(Builder);
9792       Builder.setInsertPoint(Select);
9793 
9794       // If the true value of the select is the reduction phi, the new value is
9795       // selected if the negated condition is true in any iteration.
9796       if (Select->getOperand(1) == PhiR)
9797         Cmp = Builder.createNot(Cmp);
9798       VPValue *Or = Builder.createOr(PhiR, Cmp);
9799       Select->getVPSingleValue()->replaceAllUsesWith(Or);
9800       // Delete Select now that it has invalid types.
9801       ToDelete.push_back(Select);
9802 
9803       // Convert the reduction phi to operate on bools.
9804       PhiR->setOperand(0, Plan->getOrAddLiveIn(ConstantInt::getFalse(
9805                               OrigLoop->getHeader()->getContext())));
9806       continue;
9807     }
9808 
9809     if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(
9810             RdxDesc.getRecurrenceKind())) {
9811       // Adjust the start value for FindLastIV recurrences to use the sentinel
9812       // value after generating the ResumePhi recipe, which uses the original
9813       // start value.
9814       PhiR->setOperand(0, Plan->getOrAddLiveIn(RdxDesc.getSentinelValue()));
9815     }
9816   }
9817 
9818   VPlanTransforms::clearReductionWrapFlags(*Plan);
9819   for (VPRecipeBase *R : ToDelete)
9820     R->eraseFromParent();
9821 }
9822 
9823 void VPDerivedIVRecipe::execute(VPTransformState &State) {
9824   assert(!State.Lane && "VPDerivedIVRecipe being replicated.");
9825 
9826   // Fast-math-flags propagate from the original induction instruction.
9827   IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
9828   if (FPBinOp)
9829     State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags());
9830 
9831   Value *Step = State.get(getStepValue(), VPLane(0));
9832   Value *Index = State.get(getOperand(1), VPLane(0));
9833   Value *DerivedIV = emitTransformedIndex(
9834       State.Builder, Index, getStartValue()->getLiveInIRValue(), Step, Kind,
9835       cast_if_present<BinaryOperator>(FPBinOp));
9836   DerivedIV->setName(Name);
9837   // If index is the vector trip count, the concrete value will only be set in
9838   // prepareToExecute, leading to missed simplifications, e.g. if it is 0.
9839   // TODO: Remove the special case for the vector trip count once it is computed
9840   // in VPlan and can be used during VPlan simplification.
9841   assert((DerivedIV != Index ||
9842           getOperand(1) == &getParent()->getPlan()->getVectorTripCount()) &&
9843          "IV didn't need transforming?");
9844   State.set(this, DerivedIV, VPLane(0));
9845 }
9846 
9847 void VPReplicateRecipe::execute(VPTransformState &State) {
9848   Instruction *UI = getUnderlyingInstr();
9849   if (State.Lane) { // Generate a single instance.
9850     assert((State.VF.isScalar() || !isUniform()) &&
9851            "uniform recipe shouldn't be predicated");
9852     assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9853     State.ILV->scalarizeInstruction(UI, this, *State.Lane, State);
9854     // Insert scalar instance packing it into a vector.
9855     if (State.VF.isVector() && shouldPack()) {
9856       // If we're constructing lane 0, initialize to start from poison.
9857       if (State.Lane->isFirstLane()) {
9858         assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9859         Value *Poison = PoisonValue::get(
9860             VectorType::get(UI->getType(), State.VF));
9861         State.set(this, Poison);
9862       }
9863       State.packScalarIntoVectorValue(this, *State.Lane);
9864     }
9865     return;
9866   }
9867 
9868   if (IsUniform) {
9869     // Uniform within VL means we need to generate lane 0.
9870     State.ILV->scalarizeInstruction(UI, this, VPLane(0), State);
9871     return;
9872   }
9873 
9874   // A store of a loop varying value to a uniform address only needs the last
9875   // copy of the store.
9876   if (isa<StoreInst>(UI) &&
9877       vputils::isUniformAfterVectorization(getOperand(1))) {
9878     auto Lane = VPLane::getLastLaneForVF(State.VF);
9879     State.ILV->scalarizeInstruction(UI, this, VPLane(Lane), State);
9880     return;
9881   }
9882 
9883   // Generate scalar instances for all VF lanes.
9884   assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9885   const unsigned EndLane = State.VF.getKnownMinValue();
9886   for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9887     State.ILV->scalarizeInstruction(UI, this, VPLane(Lane), State);
9888 }
9889 
9890 // Determine how to lower the scalar epilogue, which depends on 1) optimising
9891 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
9892 // predication, and 4) a TTI hook that analyses whether the loop is suitable
9893 // for predication.
9894 static ScalarEpilogueLowering getScalarEpilogueLowering(
9895     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
9896     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
9897     LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI) {
9898   // 1) OptSize takes precedence over all other options, i.e. if this is set,
9899   // don't look at hints or options, and don't request a scalar epilogue.
9900   // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
9901   // LoopAccessInfo (due to code dependency and not being able to reliably get
9902   // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
9903   // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
9904   // versioning when the vectorization is forced, unlike hasOptSize. So revert
9905   // back to the old way and vectorize with versioning when forced. See D81345.)
9906   if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
9907                                                       PGSOQueryType::IRPass) &&
9908                           Hints.getForce() != LoopVectorizeHints::FK_Enabled))
9909     return CM_ScalarEpilogueNotAllowedOptSize;
9910 
9911   // 2) If set, obey the directives
9912   if (PreferPredicateOverEpilogue.getNumOccurrences()) {
9913     switch (PreferPredicateOverEpilogue) {
9914     case PreferPredicateTy::ScalarEpilogue:
9915       return CM_ScalarEpilogueAllowed;
9916     case PreferPredicateTy::PredicateElseScalarEpilogue:
9917       return CM_ScalarEpilogueNotNeededUsePredicate;
9918     case PreferPredicateTy::PredicateOrDontVectorize:
9919       return CM_ScalarEpilogueNotAllowedUsePredicate;
9920     };
9921   }
9922 
9923   // 3) If set, obey the hints
9924   switch (Hints.getPredicate()) {
9925   case LoopVectorizeHints::FK_Enabled:
9926     return CM_ScalarEpilogueNotNeededUsePredicate;
9927   case LoopVectorizeHints::FK_Disabled:
9928     return CM_ScalarEpilogueAllowed;
9929   };
9930 
9931   // 4) if the TTI hook indicates this is profitable, request predication.
9932   TailFoldingInfo TFI(TLI, &LVL, IAI);
9933   if (TTI->preferPredicateOverEpilogue(&TFI))
9934     return CM_ScalarEpilogueNotNeededUsePredicate;
9935 
9936   return CM_ScalarEpilogueAllowed;
9937 }
9938 
9939 // Process the loop in the VPlan-native vectorization path. This path builds
9940 // VPlan upfront in the vectorization pipeline, which allows to apply
9941 // VPlan-to-VPlan transformations from the very beginning without modifying the
9942 // input LLVM IR.
9943 static bool processLoopInVPlanNativePath(
9944     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
9945     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
9946     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
9947     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
9948     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
9949     LoopVectorizationRequirements &Requirements) {
9950 
9951   if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
9952     LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
9953     return false;
9954   }
9955   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
9956   Function *F = L->getHeader()->getParent();
9957   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
9958 
9959   ScalarEpilogueLowering SEL =
9960       getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, *LVL, &IAI);
9961 
9962   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
9963                                 &Hints, IAI);
9964   // Use the planner for outer loop vectorization.
9965   // TODO: CM is not used at this point inside the planner. Turn CM into an
9966   // optional argument if we don't need it in the future.
9967   LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, IAI, PSE, Hints,
9968                                ORE);
9969 
9970   // Get user vectorization factor.
9971   ElementCount UserVF = Hints.getWidth();
9972 
9973   CM.collectElementTypesForWidening();
9974 
9975   // Plan how to best vectorize, return the best VF and its cost.
9976   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
9977 
9978   // If we are stress testing VPlan builds, do not attempt to generate vector
9979   // code. Masked vector code generation support will follow soon.
9980   // Also, do not attempt to vectorize if no vector code will be produced.
9981   if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF)
9982     return false;
9983 
9984   VPlan &BestPlan = LVP.getPlanFor(VF.Width);
9985 
9986   {
9987     bool AddBranchWeights =
9988         hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
9989     GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(),
9990                              AddBranchWeights, CM.CostKind);
9991     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
9992                            VF.Width, 1, LVL, &CM, BFI, PSI, Checks, BestPlan);
9993     LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
9994                       << L->getHeader()->getParent()->getName() << "\"\n");
9995     LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false);
9996   }
9997 
9998   reportVectorization(ORE, L, VF, 1);
9999 
10000   // Mark the loop as already vectorized to avoid vectorizing again.
10001   Hints.setAlreadyVectorized();
10002   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10003   return true;
10004 }
10005 
10006 // Emit a remark if there are stores to floats that required a floating point
10007 // extension. If the vectorized loop was generated with floating point there
10008 // will be a performance penalty from the conversion overhead and the change in
10009 // the vector width.
10010 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
10011   SmallVector<Instruction *, 4> Worklist;
10012   for (BasicBlock *BB : L->getBlocks()) {
10013     for (Instruction &Inst : *BB) {
10014       if (auto *S = dyn_cast<StoreInst>(&Inst)) {
10015         if (S->getValueOperand()->getType()->isFloatTy())
10016           Worklist.push_back(S);
10017       }
10018     }
10019   }
10020 
10021   // Traverse the floating point stores upwards searching, for floating point
10022   // conversions.
10023   SmallPtrSet<const Instruction *, 4> Visited;
10024   SmallPtrSet<const Instruction *, 4> EmittedRemark;
10025   while (!Worklist.empty()) {
10026     auto *I = Worklist.pop_back_val();
10027     if (!L->contains(I))
10028       continue;
10029     if (!Visited.insert(I).second)
10030       continue;
10031 
10032     // Emit a remark if the floating point store required a floating
10033     // point conversion.
10034     // TODO: More work could be done to identify the root cause such as a
10035     // constant or a function return type and point the user to it.
10036     if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
10037       ORE->emit([&]() {
10038         return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
10039                                           I->getDebugLoc(), L->getHeader())
10040                << "floating point conversion changes vector width. "
10041                << "Mixed floating point precision requires an up/down "
10042                << "cast that will negatively impact performance.";
10043       });
10044 
10045     for (Use &Op : I->operands())
10046       if (auto *OpI = dyn_cast<Instruction>(Op))
10047         Worklist.push_back(OpI);
10048   }
10049 }
10050 
10051 static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
10052                                        VectorizationFactor &VF, Loop *L,
10053                                        const TargetTransformInfo &TTI,
10054                                        PredicatedScalarEvolution &PSE,
10055                                        ScalarEpilogueLowering SEL) {
10056   InstructionCost CheckCost = Checks.getCost();
10057   if (!CheckCost.isValid())
10058     return false;
10059 
10060   // When interleaving only scalar and vector cost will be equal, which in turn
10061   // would lead to a divide by 0. Fall back to hard threshold.
10062   if (VF.Width.isScalar()) {
10063     if (CheckCost > VectorizeMemoryCheckThreshold) {
10064       LLVM_DEBUG(
10065           dbgs()
10066           << "LV: Interleaving only is not profitable due to runtime checks\n");
10067       return false;
10068     }
10069     return true;
10070   }
10071 
10072   // The scalar cost should only be 0 when vectorizing with a user specified VF/IC. In those cases, runtime checks should always be generated.
10073   uint64_t ScalarC = *VF.ScalarCost.getValue();
10074   if (ScalarC == 0)
10075     return true;
10076 
10077   // First, compute the minimum iteration count required so that the vector
10078   // loop outperforms the scalar loop.
10079   //  The total cost of the scalar loop is
10080   //   ScalarC * TC
10081   //  where
10082   //  * TC is the actual trip count of the loop.
10083   //  * ScalarC is the cost of a single scalar iteration.
10084   //
10085   //  The total cost of the vector loop is
10086   //    RtC + VecC * (TC / VF) + EpiC
10087   //  where
10088   //  * RtC is the cost of the generated runtime checks
10089   //  * VecC is the cost of a single vector iteration.
10090   //  * TC is the actual trip count of the loop
10091   //  * VF is the vectorization factor
10092   //  * EpiCost is the cost of the generated epilogue, including the cost
10093   //    of the remaining scalar operations.
10094   //
10095   // Vectorization is profitable once the total vector cost is less than the
10096   // total scalar cost:
10097   //   RtC + VecC * (TC / VF) + EpiC <  ScalarC * TC
10098   //
10099   // Now we can compute the minimum required trip count TC as
10100   //   VF * (RtC + EpiC) / (ScalarC * VF - VecC) < TC
10101   //
10102   // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
10103   // the computations are performed on doubles, not integers and the result
10104   // is rounded up, hence we get an upper estimate of the TC.
10105   unsigned IntVF = getEstimatedRuntimeVF(L, TTI, VF.Width);
10106   uint64_t RtC = *CheckCost.getValue();
10107   uint64_t Div = ScalarC * IntVF - *VF.Cost.getValue();
10108   uint64_t MinTC1 = Div == 0 ? 0 : divideCeil(RtC * IntVF, Div);
10109 
10110   // Second, compute a minimum iteration count so that the cost of the
10111   // runtime checks is only a fraction of the total scalar loop cost. This
10112   // adds a loop-dependent bound on the overhead incurred if the runtime
10113   // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC
10114   // * TC. To bound the runtime check to be a fraction 1/X of the scalar
10115   // cost, compute
10116   //   RtC < ScalarC * TC * (1 / X)  ==>  RtC * X / ScalarC < TC
10117   uint64_t MinTC2 = divideCeil(RtC * 10, ScalarC);
10118 
10119   // Now pick the larger minimum. If it is not a multiple of VF and a scalar
10120   // epilogue is allowed, choose the next closest multiple of VF. This should
10121   // partly compensate for ignoring the epilogue cost.
10122   uint64_t MinTC = std::max(MinTC1, MinTC2);
10123   if (SEL == CM_ScalarEpilogueAllowed)
10124     MinTC = alignTo(MinTC, IntVF);
10125   VF.MinProfitableTripCount = ElementCount::getFixed(MinTC);
10126 
10127   LLVM_DEBUG(
10128       dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
10129              << VF.MinProfitableTripCount << "\n");
10130 
10131   // Skip vectorization if the expected trip count is less than the minimum
10132   // required trip count.
10133   if (auto ExpectedTC = getSmallBestKnownTC(PSE, L)) {
10134     if (ElementCount::isKnownLT(ElementCount::getFixed(*ExpectedTC),
10135                                 VF.MinProfitableTripCount)) {
10136       LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
10137                            "trip count < minimum profitable VF ("
10138                         << *ExpectedTC << " < " << VF.MinProfitableTripCount
10139                         << ")\n");
10140 
10141       return false;
10142     }
10143   }
10144   return true;
10145 }
10146 
10147 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
10148     : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
10149                                !EnableLoopInterleaving),
10150       VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
10151                               !EnableLoopVectorization) {}
10152 
10153 /// Prepare \p MainPlan for vectorizing the main vector loop during epilogue
10154 /// vectorization. Remove ResumePhis from \p MainPlan for inductions that
10155 /// don't have a corresponding wide induction in \p EpiPlan.
10156 static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) {
10157   // Collect PHI nodes of widened phis in the VPlan for the epilogue. Those
10158   // will need their resume-values computed in the main vector loop. Others
10159   // can be removed from the main VPlan.
10160   SmallPtrSet<PHINode *, 2> EpiWidenedPhis;
10161   for (VPRecipeBase &R :
10162        EpiPlan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
10163     if (isa<VPCanonicalIVPHIRecipe>(&R))
10164       continue;
10165     EpiWidenedPhis.insert(
10166         cast<PHINode>(R.getVPSingleValue()->getUnderlyingValue()));
10167   }
10168   for (VPRecipeBase &R : make_early_inc_range(
10169            *cast<VPIRBasicBlock>(MainPlan.getScalarHeader()))) {
10170     auto *VPIRInst = cast<VPIRInstruction>(&R);
10171     auto *IRI = dyn_cast<PHINode>(&VPIRInst->getInstruction());
10172     if (!IRI)
10173       break;
10174     if (EpiWidenedPhis.contains(IRI))
10175       continue;
10176     // There is no corresponding wide induction in the epilogue plan that would
10177     // need a resume value. Remove the VPIRInst wrapping the scalar header phi
10178     // together with the corresponding ResumePhi. The resume values for the
10179     // scalar loop will be created during execution of EpiPlan.
10180     VPRecipeBase *ResumePhi = VPIRInst->getOperand(0)->getDefiningRecipe();
10181     VPIRInst->eraseFromParent();
10182     ResumePhi->eraseFromParent();
10183   }
10184   VPlanTransforms::removeDeadRecipes(MainPlan);
10185 
10186   using namespace VPlanPatternMatch;
10187   VPBasicBlock *MainScalarPH = MainPlan.getScalarPreheader();
10188   VPValue *VectorTC = &MainPlan.getVectorTripCount();
10189   // If there is a suitable resume value for the canonical induction in the
10190   // scalar (which will become vector) epilogue loop we are done. Otherwise
10191   // create it below.
10192   if (any_of(*MainScalarPH, [VectorTC](VPRecipeBase &R) {
10193         return match(&R, m_VPInstruction<VPInstruction::ResumePhi>(
10194                              m_Specific(VectorTC), m_SpecificInt(0)));
10195       }))
10196     return;
10197   VPBuilder ScalarPHBuilder(MainScalarPH, MainScalarPH->begin());
10198   ScalarPHBuilder.createNaryOp(
10199       VPInstruction::ResumePhi,
10200       {VectorTC, MainPlan.getCanonicalIV()->getStartValue()}, {},
10201       "vec.epilog.resume.val");
10202 }
10203 
10204 /// Prepare \p Plan for vectorizing the epilogue loop. That is, re-use expanded
10205 /// SCEVs from \p ExpandedSCEVs and set resume values for header recipes.
10206 static void
10207 preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L,
10208                                  const SCEV2ValueTy &ExpandedSCEVs,
10209                                  const EpilogueLoopVectorizationInfo &EPI) {
10210   VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
10211   VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
10212   Header->setName("vec.epilog.vector.body");
10213 
10214   // Re-use the trip count and steps expanded for the main loop, as
10215   // skeleton creation needs it as a value that dominates both the scalar
10216   // and vector epilogue loops
10217   // TODO: This is a workaround needed for epilogue vectorization and it
10218   // should be removed once induction resume value creation is done
10219   // directly in VPlan.
10220   for (auto &R : make_early_inc_range(*Plan.getEntry())) {
10221     auto *ExpandR = dyn_cast<VPExpandSCEVRecipe>(&R);
10222     if (!ExpandR)
10223       continue;
10224     auto *ExpandedVal =
10225         Plan.getOrAddLiveIn(ExpandedSCEVs.find(ExpandR->getSCEV())->second);
10226     ExpandR->replaceAllUsesWith(ExpandedVal);
10227     if (Plan.getTripCount() == ExpandR)
10228       Plan.resetTripCount(ExpandedVal);
10229     ExpandR->eraseFromParent();
10230   }
10231 
10232   // Ensure that the start values for all header phi recipes are updated before
10233   // vectorizing the epilogue loop.
10234   for (VPRecipeBase &R : Header->phis()) {
10235     if (auto *IV = dyn_cast<VPCanonicalIVPHIRecipe>(&R)) {
10236       // When vectorizing the epilogue loop, the canonical induction start
10237       // value needs to be changed from zero to the value after the main
10238       // vector loop. Find the resume value created during execution of the main
10239       // VPlan.
10240       // FIXME: Improve modeling for canonical IV start values in the epilogue
10241       // loop.
10242       BasicBlock *MainMiddle = find_singleton<BasicBlock>(
10243           predecessors(L->getLoopPreheader()),
10244           [&EPI](BasicBlock *BB, bool) -> BasicBlock * {
10245             if (BB != EPI.MainLoopIterationCountCheck &&
10246                 BB != EPI.EpilogueIterationCountCheck &&
10247                 BB != EPI.SCEVSafetyCheck && BB != EPI.MemSafetyCheck)
10248               return BB;
10249             return nullptr;
10250           });
10251       using namespace llvm::PatternMatch;
10252       Type *IdxTy = IV->getScalarType();
10253       PHINode *EPResumeVal = find_singleton<PHINode>(
10254           L->getLoopPreheader()->phis(),
10255           [&EPI, IdxTy, MainMiddle](PHINode &P, bool) -> PHINode * {
10256             if (P.getType() == IdxTy &&
10257                 P.getIncomingValueForBlock(MainMiddle) == EPI.VectorTripCount &&
10258                 match(
10259                     P.getIncomingValueForBlock(EPI.MainLoopIterationCountCheck),
10260                     m_SpecificInt(0)))
10261               return &P;
10262             return nullptr;
10263           });
10264       assert(EPResumeVal && "must have a resume value for the canonical IV");
10265       VPValue *VPV = Plan.getOrAddLiveIn(EPResumeVal);
10266       assert(all_of(IV->users(),
10267                     [](const VPUser *U) {
10268                       return isa<VPScalarIVStepsRecipe>(U) ||
10269                              isa<VPScalarCastRecipe>(U) ||
10270                              isa<VPDerivedIVRecipe>(U) ||
10271                              cast<VPInstruction>(U)->getOpcode() ==
10272                                  Instruction::Add;
10273                     }) &&
10274              "the canonical IV should only be used by its increment or "
10275              "ScalarIVSteps when resetting the start value");
10276       IV->setOperand(0, VPV);
10277       continue;
10278     }
10279 
10280     Value *ResumeV = nullptr;
10281     // TODO: Move setting of resume values to prepareToExecute.
10282     if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
10283       ResumeV = cast<PHINode>(ReductionPhi->getUnderlyingInstr())
10284                     ->getIncomingValueForBlock(L->getLoopPreheader());
10285       const RecurrenceDescriptor &RdxDesc =
10286           ReductionPhi->getRecurrenceDescriptor();
10287       RecurKind RK = RdxDesc.getRecurrenceKind();
10288       if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
10289         // VPReductionPHIRecipes for AnyOf reductions expect a boolean as
10290         // start value; compare the final value from the main vector loop
10291         // to the start value.
10292         IRBuilder<> Builder(
10293             cast<Instruction>(ResumeV)->getParent()->getFirstNonPHI());
10294         ResumeV =
10295             Builder.CreateICmpNE(ResumeV, RdxDesc.getRecurrenceStartValue());
10296       } else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) {
10297         // VPReductionPHIRecipe for FindLastIV reductions requires an adjustment
10298         // to the resume value. The resume value is adjusted to the sentinel
10299         // value when the final value from the main vector loop equals the start
10300         // value. This ensures correctness when the start value might not be
10301         // less than the minimum value of a monotonically increasing induction
10302         // variable.
10303         IRBuilder<> Builder(
10304             cast<Instruction>(ResumeV)->getParent()->getFirstNonPHI());
10305         Value *Cmp =
10306             Builder.CreateICmpEQ(ResumeV, RdxDesc.getRecurrenceStartValue());
10307         ResumeV =
10308             Builder.CreateSelect(Cmp, RdxDesc.getSentinelValue(), ResumeV);
10309       }
10310     } else {
10311       // Retrieve the induction resume values for wide inductions from
10312       // their original phi nodes in the scalar loop.
10313       PHINode *IndPhi = cast<VPWidenInductionRecipe>(&R)->getPHINode();
10314       // Hook up to the PHINode generated by a ResumePhi recipe of main
10315       // loop VPlan, which feeds the scalar loop.
10316       ResumeV = IndPhi->getIncomingValueForBlock(L->getLoopPreheader());
10317     }
10318     assert(ResumeV && "Must have a resume value");
10319     VPValue *StartVal = Plan.getOrAddLiveIn(ResumeV);
10320     cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
10321   }
10322 }
10323 
10324 bool LoopVectorizePass::processLoop(Loop *L) {
10325   assert((EnableVPlanNativePath || L->isInnermost()) &&
10326          "VPlan-native path is not enabled. Only process inner loops.");
10327 
10328   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
10329                     << L->getHeader()->getParent()->getName() << "' from "
10330                     << L->getLocStr() << "\n");
10331 
10332   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
10333 
10334   LLVM_DEBUG(
10335       dbgs() << "LV: Loop hints:"
10336              << " force="
10337              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
10338                      ? "disabled"
10339                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
10340                             ? "enabled"
10341                             : "?"))
10342              << " width=" << Hints.getWidth()
10343              << " interleave=" << Hints.getInterleave() << "\n");
10344 
10345   // Function containing loop
10346   Function *F = L->getHeader()->getParent();
10347 
10348   // Looking at the diagnostic output is the only way to determine if a loop
10349   // was vectorized (other than looking at the IR or machine code), so it
10350   // is important to generate an optimization remark for each loop. Most of
10351   // these messages are generated as OptimizationRemarkAnalysis. Remarks
10352   // generated as OptimizationRemark and OptimizationRemarkMissed are
10353   // less verbose reporting vectorized loops and unvectorized loops that may
10354   // benefit from vectorization, respectively.
10355 
10356   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
10357     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
10358     return false;
10359   }
10360 
10361   PredicatedScalarEvolution PSE(*SE, *L);
10362 
10363   // Check if it is legal to vectorize the loop.
10364   LoopVectorizationRequirements Requirements;
10365   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE,
10366                                 &Requirements, &Hints, DB, AC, BFI, PSI);
10367   if (!LVL.canVectorize(EnableVPlanNativePath)) {
10368     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
10369     Hints.emitRemarkWithHints();
10370     return false;
10371   }
10372 
10373   if (LVL.hasUncountableEarlyExit() && !EnableEarlyExitVectorization) {
10374     reportVectorizationFailure("Auto-vectorization of loops with uncountable "
10375                                "early exit is not enabled",
10376                                "UncountableEarlyExitLoopsDisabled", ORE, L);
10377     return false;
10378   }
10379 
10380   if (LVL.hasStructVectorCall()) {
10381     reportVectorizationFailure("Auto-vectorization of calls that return struct "
10382                                "types is not yet supported",
10383                                "StructCallVectorizationUnsupported", ORE, L);
10384     return false;
10385   }
10386 
10387   // Entrance to the VPlan-native vectorization path. Outer loops are processed
10388   // here. They may require CFG and instruction level transformations before
10389   // even evaluating whether vectorization is profitable. Since we cannot modify
10390   // the incoming IR, we need to build VPlan upfront in the vectorization
10391   // pipeline.
10392   if (!L->isInnermost())
10393     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
10394                                         ORE, BFI, PSI, Hints, Requirements);
10395 
10396   assert(L->isInnermost() && "Inner loop expected.");
10397 
10398   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
10399   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
10400 
10401   // If an override option has been passed in for interleaved accesses, use it.
10402   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
10403     UseInterleaved = EnableInterleavedMemAccesses;
10404 
10405   // Analyze interleaved memory accesses.
10406   if (UseInterleaved)
10407     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
10408 
10409   if (LVL.hasUncountableEarlyExit()) {
10410     BasicBlock *LoopLatch = L->getLoopLatch();
10411     if (IAI.requiresScalarEpilogue() ||
10412         any_of(LVL.getCountableExitingBlocks(),
10413                [LoopLatch](BasicBlock *BB) { return BB != LoopLatch; })) {
10414       reportVectorizationFailure("Auto-vectorization of early exit loops "
10415                                  "requiring a scalar epilogue is unsupported",
10416                                  "UncountableEarlyExitUnsupported", ORE, L);
10417       return false;
10418     }
10419   }
10420 
10421   // Check the function attributes and profiles to find out if this function
10422   // should be optimized for size.
10423   ScalarEpilogueLowering SEL =
10424       getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL, &IAI);
10425 
10426   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
10427   // count by optimizing for size, to minimize overheads.
10428   auto ExpectedTC = getSmallBestKnownTC(PSE, L);
10429   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
10430     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
10431                       << "This loop is worth vectorizing only if no scalar "
10432                       << "iteration overheads are incurred.");
10433     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
10434       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
10435     else {
10436       if (*ExpectedTC > TTI->getMinTripCountTailFoldingThreshold()) {
10437         LLVM_DEBUG(dbgs() << "\n");
10438         // Predicate tail-folded loops are efficient even when the loop
10439         // iteration count is low. However, setting the epilogue policy to
10440         // `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops
10441         // with runtime checks. It's more effective to let
10442         // `areRuntimeChecksProfitable` determine if vectorization is beneficial
10443         // for the loop.
10444         if (SEL != CM_ScalarEpilogueNotNeededUsePredicate)
10445           SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
10446       } else {
10447         LLVM_DEBUG(dbgs() << " But the target considers the trip count too "
10448                              "small to consider vectorizing.\n");
10449         reportVectorizationFailure(
10450             "The trip count is below the minial threshold value.",
10451             "loop trip count is too low, avoiding vectorization",
10452             "LowTripCount", ORE, L);
10453         Hints.emitRemarkWithHints();
10454         return false;
10455       }
10456     }
10457   }
10458 
10459   // Check the function attributes to see if implicit floats or vectors are
10460   // allowed.
10461   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
10462     reportVectorizationFailure(
10463         "Can't vectorize when the NoImplicitFloat attribute is used",
10464         "loop not vectorized due to NoImplicitFloat attribute",
10465         "NoImplicitFloat", ORE, L);
10466     Hints.emitRemarkWithHints();
10467     return false;
10468   }
10469 
10470   // Check if the target supports potentially unsafe FP vectorization.
10471   // FIXME: Add a check for the type of safety issue (denormal, signaling)
10472   // for the target we're vectorizing for, to make sure none of the
10473   // additional fp-math flags can help.
10474   if (Hints.isPotentiallyUnsafe() &&
10475       TTI->isFPVectorizationPotentiallyUnsafe()) {
10476     reportVectorizationFailure(
10477         "Potentially unsafe FP op prevents vectorization",
10478         "loop not vectorized due to unsafe FP support.",
10479         "UnsafeFP", ORE, L);
10480     Hints.emitRemarkWithHints();
10481     return false;
10482   }
10483 
10484   bool AllowOrderedReductions;
10485   // If the flag is set, use that instead and override the TTI behaviour.
10486   if (ForceOrderedReductions.getNumOccurrences() > 0)
10487     AllowOrderedReductions = ForceOrderedReductions;
10488   else
10489     AllowOrderedReductions = TTI->enableOrderedReductions();
10490   if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) {
10491     ORE->emit([&]() {
10492       auto *ExactFPMathInst = Requirements.getExactFPInst();
10493       return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
10494                                                  ExactFPMathInst->getDebugLoc(),
10495                                                  ExactFPMathInst->getParent())
10496              << "loop not vectorized: cannot prove it is safe to reorder "
10497                 "floating-point operations";
10498     });
10499     LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
10500                          "reorder floating-point operations\n");
10501     Hints.emitRemarkWithHints();
10502     return false;
10503   }
10504 
10505   // Use the cost model.
10506   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
10507                                 F, &Hints, IAI);
10508   // Use the planner for vectorization.
10509   LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
10510                                ORE);
10511 
10512   // Get user vectorization factor and interleave count.
10513   ElementCount UserVF = Hints.getWidth();
10514   unsigned UserIC = Hints.getInterleave();
10515 
10516   // Plan how to best vectorize.
10517   LVP.plan(UserVF, UserIC);
10518   VectorizationFactor VF = LVP.computeBestVF();
10519   unsigned IC = 1;
10520 
10521   if (ORE->allowExtraAnalysis(LV_NAME))
10522     LVP.emitInvalidCostRemarks(ORE);
10523 
10524   bool AddBranchWeights =
10525       hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
10526   GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(),
10527                            AddBranchWeights, CM.CostKind);
10528   if (LVP.hasPlanWithVF(VF.Width)) {
10529     // Select the interleave count.
10530     IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
10531 
10532     unsigned SelectedIC = std::max(IC, UserIC);
10533     //  Optimistically generate runtime checks if they are needed. Drop them if
10534     //  they turn out to not be profitable.
10535     if (VF.Width.isVector() || SelectedIC > 1)
10536       Checks.create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
10537 
10538     // Check if it is profitable to vectorize with runtime checks.
10539     bool ForceVectorization =
10540         Hints.getForce() == LoopVectorizeHints::FK_Enabled;
10541     if (!ForceVectorization &&
10542         !areRuntimeChecksProfitable(Checks, VF, L, *TTI, PSE, SEL)) {
10543       ORE->emit([&]() {
10544         return OptimizationRemarkAnalysisAliasing(
10545                    DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
10546                    L->getHeader())
10547                << "loop not vectorized: cannot prove it is safe to reorder "
10548                   "memory operations";
10549       });
10550       LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
10551       Hints.emitRemarkWithHints();
10552       return false;
10553     }
10554   }
10555 
10556   // Identify the diagnostic messages that should be produced.
10557   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
10558   bool VectorizeLoop = true, InterleaveLoop = true;
10559   if (VF.Width.isScalar()) {
10560     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
10561     VecDiagMsg = std::make_pair(
10562         "VectorizationNotBeneficial",
10563         "the cost-model indicates that vectorization is not beneficial");
10564     VectorizeLoop = false;
10565   }
10566 
10567   if (!LVP.hasPlanWithVF(VF.Width) && UserIC > 1) {
10568     // Tell the user interleaving was avoided up-front, despite being explicitly
10569     // requested.
10570     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
10571                          "interleaving should be avoided up front\n");
10572     IntDiagMsg = std::make_pair(
10573         "InterleavingAvoided",
10574         "Ignoring UserIC, because interleaving was avoided up front");
10575     InterleaveLoop = false;
10576   } else if (IC == 1 && UserIC <= 1) {
10577     // Tell the user interleaving is not beneficial.
10578     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
10579     IntDiagMsg = std::make_pair(
10580         "InterleavingNotBeneficial",
10581         "the cost-model indicates that interleaving is not beneficial");
10582     InterleaveLoop = false;
10583     if (UserIC == 1) {
10584       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
10585       IntDiagMsg.second +=
10586           " and is explicitly disabled or interleave count is set to 1";
10587     }
10588   } else if (IC > 1 && UserIC == 1) {
10589     // Tell the user interleaving is beneficial, but it explicitly disabled.
10590     LLVM_DEBUG(
10591         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
10592     IntDiagMsg = std::make_pair(
10593         "InterleavingBeneficialButDisabled",
10594         "the cost-model indicates that interleaving is beneficial "
10595         "but is explicitly disabled or interleave count is set to 1");
10596     InterleaveLoop = false;
10597   }
10598 
10599   // If there is a histogram in the loop, do not just interleave without
10600   // vectorizing. The order of operations will be incorrect without the
10601   // histogram intrinsics, which are only used for recipes with VF > 1.
10602   if (!VectorizeLoop && InterleaveLoop && LVL.hasHistograms()) {
10603     LLVM_DEBUG(dbgs() << "LV: Not interleaving without vectorization due "
10604                       << "to histogram operations.\n");
10605     IntDiagMsg = std::make_pair(
10606         "HistogramPreventsScalarInterleaving",
10607         "Unable to interleave without vectorization due to constraints on "
10608         "the order of histogram operations");
10609     InterleaveLoop = false;
10610   }
10611 
10612   // Override IC if user provided an interleave count.
10613   IC = UserIC > 0 ? UserIC : IC;
10614 
10615   // Emit diagnostic messages, if any.
10616   const char *VAPassName = Hints.vectorizeAnalysisPassName();
10617   if (!VectorizeLoop && !InterleaveLoop) {
10618     // Do not vectorize or interleaving the loop.
10619     ORE->emit([&]() {
10620       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
10621                                       L->getStartLoc(), L->getHeader())
10622              << VecDiagMsg.second;
10623     });
10624     ORE->emit([&]() {
10625       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
10626                                       L->getStartLoc(), L->getHeader())
10627              << IntDiagMsg.second;
10628     });
10629     return false;
10630   }
10631 
10632   if (!VectorizeLoop && InterleaveLoop) {
10633     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10634     ORE->emit([&]() {
10635       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
10636                                         L->getStartLoc(), L->getHeader())
10637              << VecDiagMsg.second;
10638     });
10639   } else if (VectorizeLoop && !InterleaveLoop) {
10640     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10641                       << ") in " << L->getLocStr() << '\n');
10642     ORE->emit([&]() {
10643       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
10644                                         L->getStartLoc(), L->getHeader())
10645              << IntDiagMsg.second;
10646     });
10647   } else if (VectorizeLoop && InterleaveLoop) {
10648     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10649                       << ") in " << L->getLocStr() << '\n');
10650     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10651   }
10652 
10653   bool DisableRuntimeUnroll = false;
10654   MDNode *OrigLoopID = L->getLoopID();
10655   {
10656     using namespace ore;
10657     if (!VectorizeLoop) {
10658       assert(IC > 1 && "interleave count should not be 1 or 0");
10659       // If we decided that it is not legal to vectorize the loop, then
10660       // interleave it.
10661       VPlan &BestPlan = LVP.getPlanFor(VF.Width);
10662       InnerLoopVectorizer Unroller(
10663           L, PSE, LI, DT, TLI, TTI, AC, ORE, ElementCount::getFixed(1),
10664           ElementCount::getFixed(1), IC, &LVL, &CM, BFI, PSI, Checks, BestPlan);
10665 
10666       LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);
10667 
10668       ORE->emit([&]() {
10669         return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
10670                                   L->getHeader())
10671                << "interleaved loop (interleaved count: "
10672                << NV("InterleaveCount", IC) << ")";
10673       });
10674     } else {
10675       // If we decided that it is *legal* to vectorize the loop, then do it.
10676 
10677       VPlan &BestPlan = LVP.getPlanFor(VF.Width);
10678       // Consider vectorizing the epilogue too if it's profitable.
10679       VectorizationFactor EpilogueVF =
10680           LVP.selectEpilogueVectorizationFactor(VF.Width, IC);
10681       if (EpilogueVF.Width.isVector()) {
10682         std::unique_ptr<VPlan> BestMainPlan(BestPlan.duplicate());
10683 
10684         // The first pass vectorizes the main loop and creates a scalar epilogue
10685         // to be vectorized by executing the plan (potentially with a different
10686         // factor) again shortly afterwards.
10687         VPlan &BestEpiPlan = LVP.getPlanFor(EpilogueVF.Width);
10688         preparePlanForMainVectorLoop(*BestMainPlan, BestEpiPlan);
10689         EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1,
10690                                           BestEpiPlan);
10691         EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
10692                                            EPI, &LVL, &CM, BFI, PSI, Checks,
10693                                            *BestMainPlan);
10694         auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF,
10695                                              *BestMainPlan, MainILV, DT, false);
10696         ++LoopsVectorized;
10697 
10698         // Second pass vectorizes the epilogue and adjusts the control flow
10699         // edges from the first pass.
10700         EPI.MainLoopVF = EPI.EpilogueVF;
10701         EPI.MainLoopUF = EPI.EpilogueUF;
10702         EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
10703                                                  ORE, EPI, &LVL, &CM, BFI, PSI,
10704                                                  Checks, BestEpiPlan);
10705         EpilogILV.setTripCount(MainILV.getTripCount());
10706         preparePlanForEpilogueVectorLoop(BestEpiPlan, L, ExpandedSCEVs, EPI);
10707 
10708         LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
10709                         DT, true, &ExpandedSCEVs);
10710         ++LoopsEpilogueVectorized;
10711 
10712         if (!MainILV.areSafetyChecksAdded())
10713           DisableRuntimeUnroll = true;
10714       } else {
10715         InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
10716                                VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
10717                                PSI, Checks, BestPlan);
10718         LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
10719         ++LoopsVectorized;
10720 
10721         // Add metadata to disable runtime unrolling a scalar loop when there
10722         // are no runtime checks about strides and memory. A scalar loop that is
10723         // rarely used is not worth unrolling.
10724         if (!LB.areSafetyChecksAdded())
10725           DisableRuntimeUnroll = true;
10726       }
10727       // Report the vectorization decision.
10728       reportVectorization(ORE, L, VF, IC);
10729     }
10730 
10731     if (ORE->allowExtraAnalysis(LV_NAME))
10732       checkMixedPrecision(L, ORE);
10733   }
10734 
10735   assert(DT->verify(DominatorTree::VerificationLevel::Fast) &&
10736          "DT not preserved correctly");
10737 
10738   std::optional<MDNode *> RemainderLoopID =
10739       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
10740                                       LLVMLoopVectorizeFollowupEpilogue});
10741   if (RemainderLoopID) {
10742     L->setLoopID(*RemainderLoopID);
10743   } else {
10744     if (DisableRuntimeUnroll)
10745       addRuntimeUnrollDisableMetaData(L);
10746 
10747     // Mark the loop as already vectorized to avoid vectorizing again.
10748     Hints.setAlreadyVectorized();
10749   }
10750 
10751   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10752   return true;
10753 }
10754 
10755 LoopVectorizeResult LoopVectorizePass::runImpl(Function &F) {
10756 
10757   // Don't attempt if
10758   // 1. the target claims to have no vector registers, and
10759   // 2. interleaving won't help ILP.
10760   //
10761   // The second condition is necessary because, even if the target has no
10762   // vector registers, loop vectorization may still enable scalar
10763   // interleaving.
10764   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
10765       TTI->getMaxInterleaveFactor(ElementCount::getFixed(1)) < 2)
10766     return LoopVectorizeResult(false, false);
10767 
10768   bool Changed = false, CFGChanged = false;
10769 
10770   // The vectorizer requires loops to be in simplified form.
10771   // Since simplification may add new inner loops, it has to run before the
10772   // legality and profitability checks. This means running the loop vectorizer
10773   // will simplify all loops, regardless of whether anything end up being
10774   // vectorized.
10775   for (const auto &L : *LI)
10776     Changed |= CFGChanged |=
10777         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10778 
10779   // Build up a worklist of inner-loops to vectorize. This is necessary as
10780   // the act of vectorizing or partially unrolling a loop creates new loops
10781   // and can invalidate iterators across the loops.
10782   SmallVector<Loop *, 8> Worklist;
10783 
10784   for (Loop *L : *LI)
10785     collectSupportedLoops(*L, LI, ORE, Worklist);
10786 
10787   LoopsAnalyzed += Worklist.size();
10788 
10789   // Now walk the identified inner loops.
10790   while (!Worklist.empty()) {
10791     Loop *L = Worklist.pop_back_val();
10792 
10793     // For the inner loops we actually process, form LCSSA to simplify the
10794     // transform.
10795     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
10796 
10797     Changed |= CFGChanged |= processLoop(L);
10798 
10799     if (Changed) {
10800       LAIs->clear();
10801 
10802 #ifndef NDEBUG
10803       if (VerifySCEV)
10804         SE->verify();
10805 #endif
10806     }
10807   }
10808 
10809   // Process each loop nest in the function.
10810   return LoopVectorizeResult(Changed, CFGChanged);
10811 }
10812 
10813 PreservedAnalyses LoopVectorizePass::run(Function &F,
10814                                          FunctionAnalysisManager &AM) {
10815   LI = &AM.getResult<LoopAnalysis>(F);
10816   // There are no loops in the function. Return before computing other
10817   // expensive analyses.
10818   if (LI->empty())
10819     return PreservedAnalyses::all();
10820   SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
10821   TTI = &AM.getResult<TargetIRAnalysis>(F);
10822   DT = &AM.getResult<DominatorTreeAnalysis>(F);
10823   TLI = &AM.getResult<TargetLibraryAnalysis>(F);
10824   AC = &AM.getResult<AssumptionAnalysis>(F);
10825   DB = &AM.getResult<DemandedBitsAnalysis>(F);
10826   ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
10827   LAIs = &AM.getResult<LoopAccessAnalysis>(F);
10828 
10829   auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
10830   PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
10831   BFI = nullptr;
10832   if (PSI && PSI->hasProfileSummary())
10833     BFI = &AM.getResult<BlockFrequencyAnalysis>(F);
10834   LoopVectorizeResult Result = runImpl(F);
10835   if (!Result.MadeAnyChange)
10836     return PreservedAnalyses::all();
10837   PreservedAnalyses PA;
10838 
10839   if (isAssignmentTrackingEnabled(*F.getParent())) {
10840     for (auto &BB : F)
10841       RemoveRedundantDbgInstrs(&BB);
10842   }
10843 
10844   PA.preserve<LoopAnalysis>();
10845   PA.preserve<DominatorTreeAnalysis>();
10846   PA.preserve<ScalarEvolutionAnalysis>();
10847   PA.preserve<LoopAccessAnalysis>();
10848 
10849   if (Result.MadeCFGChange) {
10850     // Making CFG changes likely means a loop got vectorized. Indicate that
10851     // extra simplification passes should be run.
10852     // TODO: MadeCFGChanges is not a prefect proxy. Extra passes should only
10853     // be run if runtime checks have been added.
10854     AM.getResult<ShouldRunExtraVectorPasses>(F);
10855     PA.preserve<ShouldRunExtraVectorPasses>();
10856   } else {
10857     PA.preserveSet<CFGAnalyses>();
10858   }
10859   return PA;
10860 }
10861 
10862 void LoopVectorizePass::printPipeline(
10863     raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
10864   static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
10865       OS, MapClassName2PassName);
10866 
10867   OS << '<';
10868   OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
10869   OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
10870   OS << '>';
10871 }
10872