1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
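//
// As an illustrative sketch only (pseudo C, not tied to any target), a scalar
// loop such as
//
//   for (i = 0; i < n; i++)
//     a[i] = b[i] + c[i];
//
// is conceptually turned, for a vectorization factor of 4, into
//
//   for (i = 0; i + 3 < n; i += 4)
//     a[i:i+3] = b[i:i+3] + c[i:i+3];   // one 'wide' iteration
//   for (; i < n; i++)                  // scalar remainder (epilogue)
//     a[i] = b[i] + c[i];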
17 //
18 // This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
28 // There is a development effort going on to migrate the loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanAnalysis.h"
61 #include "VPlanHCFGBuilder.h"
62 #include "VPlanPatternMatch.h"
63 #include "VPlanTransforms.h"
64 #include "VPlanUtils.h"
65 #include "VPlanVerifier.h"
66 #include "llvm/ADT/APInt.h"
67 #include "llvm/ADT/ArrayRef.h"
68 #include "llvm/ADT/DenseMap.h"
69 #include "llvm/ADT/DenseMapInfo.h"
70 #include "llvm/ADT/Hashing.h"
71 #include "llvm/ADT/MapVector.h"
72 #include "llvm/ADT/STLExtras.h"
73 #include "llvm/ADT/SmallPtrSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/TypeSwitch.h"
79 #include "llvm/ADT/iterator_range.h"
80 #include "llvm/Analysis/AssumptionCache.h"
81 #include "llvm/Analysis/BasicAliasAnalysis.h"
82 #include "llvm/Analysis/BlockFrequencyInfo.h"
83 #include "llvm/Analysis/CFG.h"
84 #include "llvm/Analysis/CodeMetrics.h"
85 #include "llvm/Analysis/DemandedBits.h"
86 #include "llvm/Analysis/GlobalsModRef.h"
87 #include "llvm/Analysis/LoopAccessAnalysis.h"
88 #include "llvm/Analysis/LoopAnalysisManager.h"
89 #include "llvm/Analysis/LoopInfo.h"
90 #include "llvm/Analysis/LoopIterator.h"
91 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
92 #include "llvm/Analysis/ProfileSummaryInfo.h"
93 #include "llvm/Analysis/ScalarEvolution.h"
94 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
95 #include "llvm/Analysis/TargetLibraryInfo.h"
96 #include "llvm/Analysis/TargetTransformInfo.h"
97 #include "llvm/Analysis/ValueTracking.h"
98 #include "llvm/Analysis/VectorUtils.h"
99 #include "llvm/IR/Attributes.h"
100 #include "llvm/IR/BasicBlock.h"
101 #include "llvm/IR/CFG.h"
102 #include "llvm/IR/Constant.h"
103 #include "llvm/IR/Constants.h"
104 #include "llvm/IR/DataLayout.h"
105 #include "llvm/IR/DebugInfo.h"
106 #include "llvm/IR/DebugLoc.h"
107 #include "llvm/IR/DerivedTypes.h"
108 #include "llvm/IR/DiagnosticInfo.h"
109 #include "llvm/IR/Dominators.h"
110 #include "llvm/IR/Function.h"
111 #include "llvm/IR/IRBuilder.h"
112 #include "llvm/IR/InstrTypes.h"
113 #include "llvm/IR/Instruction.h"
114 #include "llvm/IR/Instructions.h"
115 #include "llvm/IR/IntrinsicInst.h"
116 #include "llvm/IR/Intrinsics.h"
117 #include "llvm/IR/MDBuilder.h"
118 #include "llvm/IR/Metadata.h"
119 #include "llvm/IR/Module.h"
120 #include "llvm/IR/Operator.h"
121 #include "llvm/IR/PatternMatch.h"
122 #include "llvm/IR/ProfDataUtils.h"
123 #include "llvm/IR/Type.h"
124 #include "llvm/IR/Use.h"
125 #include "llvm/IR/User.h"
126 #include "llvm/IR/Value.h"
127 #include "llvm/IR/Verifier.h"
128 #include "llvm/Support/Casting.h"
129 #include "llvm/Support/CommandLine.h"
130 #include "llvm/Support/Debug.h"
131 #include "llvm/Support/ErrorHandling.h"
132 #include "llvm/Support/InstructionCost.h"
133 #include "llvm/Support/MathExtras.h"
134 #include "llvm/Support/NativeFormatting.h"
135 #include "llvm/Support/raw_ostream.h"
136 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
137 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
138 #include "llvm/Transforms/Utils/Local.h"
139 #include "llvm/Transforms/Utils/LoopSimplify.h"
140 #include "llvm/Transforms/Utils/LoopUtils.h"
141 #include "llvm/Transforms/Utils/LoopVersioning.h"
142 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
143 #include "llvm/Transforms/Utils/SizeOpts.h"
144 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
145 #include <algorithm>
146 #include <cassert>
147 #include <cstdint>
148 #include <functional>
149 #include <iterator>
150 #include <limits>
151 #include <memory>
152 #include <string>
153 #include <tuple>
154 #include <utility>
155 
156 using namespace llvm;
157 
158 #define LV_NAME "loop-vectorize"
159 #define DEBUG_TYPE LV_NAME
160 
161 #ifndef NDEBUG
162 const char VerboseDebug[] = DEBUG_TYPE "-verbose";
163 #endif
164 
165 /// @{
166 /// Metadata attribute names
167 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
168 const char LLVMLoopVectorizeFollowupVectorized[] =
169     "llvm.loop.vectorize.followup_vectorized";
170 const char LLVMLoopVectorizeFollowupEpilogue[] =
171     "llvm.loop.vectorize.followup_epilogue";
172 /// @}
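//
// These names correspond to the loop transformation followup attributes
// described in llvm/docs/TransformMetadata.rst. As a rough sketch (the metadata
// node numbers are made up for illustration), a loop can request that specific
// metadata be attached to the vectorized loop produced by this pass:
//
//   !0 = distinct !{!0, !1}
//   !1 = !{!"llvm.loop.vectorize.followup_vectorized", !2}
//   !2 = !{!"llvm.loop.isvectorized"}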
173 
174 STATISTIC(LoopsVectorized, "Number of loops vectorized");
175 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
176 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
177 
178 static cl::opt<bool> EnableEpilogueVectorization(
179     "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
180     cl::desc("Enable vectorization of epilogue loops."));
181 
182 static cl::opt<unsigned> EpilogueVectorizationForceVF(
183     "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
184     cl::desc("When epilogue vectorization is enabled, and a value greater than "
185              "1 is specified, forces the given VF for all applicable epilogue "
186              "loops."));
187 
188 static cl::opt<unsigned> EpilogueVectorizationMinVF(
189     "epilogue-vectorization-minimum-VF", cl::Hidden,
190     cl::desc("Only loops with vectorization factor equal to or larger than "
191              "the specified value are considered for epilogue vectorization."));
192 
193 /// Loops with a known constant trip count below this number are vectorized only
194 /// if no scalar iteration overheads are incurred.
195 static cl::opt<unsigned> TinyTripCountVectorThreshold(
196     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
197     cl::desc("Loops with a constant trip count that is smaller than this "
198              "value are vectorized only if no scalar iteration overheads "
199              "are incurred."));
200 
201 static cl::opt<unsigned> VectorizeMemoryCheckThreshold(
202     "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
203     cl::desc("The maximum allowed number of runtime memory checks"));
204 
205 // Option prefer-predicate-over-epilogue indicates that an epilogue is
206 // undesired and that predication is preferred. I.e., the vectorizer will try
207 // to fold the tail loop (epilogue) into the vector body and predicate the
208 // instructions accordingly. If tail-folding fails, the fallback strategy is
209 // selected by the following values:
210 namespace PreferPredicateTy {
211   enum Option {
212     ScalarEpilogue = 0,
213     PredicateElseScalarEpilogue,
214     PredicateOrDontVectorize
215   };
216 } // namespace PreferPredicateTy
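
// As an informal illustration of what tail folding means here (pseudo C,
// assuming a vectorization factor of 4): instead of emitting a scalar epilogue,
//
//   for (i = 0; i + 3 < n; i += 4)   // vector body
//     ...
//   for (; i < n; i++)               // scalar epilogue
//     ...
//
// the tail is folded into a predicated vector body that executes each wide
// iteration under the mask (i + <0,1,2,3>) < n:
//
//   for (i = 0; i < n; i += 4)
//     ... masked wide iteration ...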
217 
218 static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
219     "prefer-predicate-over-epilogue",
220     cl::init(PreferPredicateTy::ScalarEpilogue),
221     cl::Hidden,
222     cl::desc("Tail-folding and predication preferences over creating a scalar "
223              "epilogue loop."),
224     cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
225                          "scalar-epilogue",
226                          "Don't tail-predicate loops, create scalar epilogue"),
227               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
228                          "predicate-else-scalar-epilogue",
229                          "prefer tail-folding, create scalar epilogue if tail "
230                          "folding fails."),
231               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
232                          "predicate-dont-vectorize",
233                          "prefer tail-folding, don't attempt vectorization if "
234                          "tail-folding fails.")));
235 
236 static cl::opt<TailFoldingStyle> ForceTailFoldingStyle(
237     "force-tail-folding-style", cl::desc("Force the tail folding style"),
238     cl::init(TailFoldingStyle::None),
239     cl::values(
240         clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"),
241         clEnumValN(
242             TailFoldingStyle::Data, "data",
243             "Create lane mask for data only, using active.lane.mask intrinsic"),
244         clEnumValN(TailFoldingStyle::DataWithoutLaneMask,
245                    "data-without-lane-mask",
246                    "Create lane mask with compare/stepvector"),
247         clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control",
248                    "Create lane mask using active.lane.mask intrinsic, and use "
249                    "it for both data and control flow"),
250         clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
251                    "data-and-control-without-rt-check",
252                    "Similar to data-and-control, but remove the runtime check"),
253         clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl",
254                    "Use predicated EVL instructions for tail folding. If EVL "
255                    "is unsupported, fall back to data-without-lane-mask.")));
256 
257 static cl::opt<bool> MaximizeBandwidth(
258     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
259     cl::desc("Maximize bandwidth when selecting vectorization factor which "
260              "will be determined by the smallest type in the loop."));
261 
262 static cl::opt<bool> EnableInterleavedMemAccesses(
263     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
264     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
265 
266 /// An interleave-group may need masking if it resides in a block that needs
267 /// predication, or in order to mask away gaps.
268 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
269     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
270     cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
271 
272 static cl::opt<unsigned> ForceTargetNumScalarRegs(
273     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
274     cl::desc("A flag that overrides the target's number of scalar registers."));
275 
276 static cl::opt<unsigned> ForceTargetNumVectorRegs(
277     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
278     cl::desc("A flag that overrides the target's number of vector registers."));
279 
280 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
281     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
282     cl::desc("A flag that overrides the target's max interleave factor for "
283              "scalar loops."));
284 
285 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
286     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
287     cl::desc("A flag that overrides the target's max interleave factor for "
288              "vectorized loops."));
289 
290 cl::opt<unsigned> ForceTargetInstructionCost(
291     "force-target-instruction-cost", cl::init(0), cl::Hidden,
292     cl::desc("A flag that overrides the target's expected cost for "
293              "an instruction to a single constant value. Mostly "
294              "useful for getting consistent testing."));
295 
296 static cl::opt<bool> ForceTargetSupportsScalableVectors(
297     "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
298     cl::desc(
299         "Pretend that scalable vectors are supported, even if the target does "
300         "not support them. This flag should only be used for testing."));
301 
302 static cl::opt<unsigned> SmallLoopCost(
303     "small-loop-cost", cl::init(20), cl::Hidden,
304     cl::desc(
305         "The cost of a loop that is considered 'small' by the interleaver."));
306 
307 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
308     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
309     cl::desc("Enable the use of the block frequency analysis to access PGO "
310              "heuristics minimizing code growth in cold regions and being more "
311              "aggressive in hot regions."));
312 
313 // Runtime interleave loops for load/store throughput.
314 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
315     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
316     cl::desc(
317         "Enable runtime interleaving until load/store ports are saturated"));
318 
319 /// The number of stores in a loop that are allowed to need predication.
320 static cl::opt<unsigned> NumberOfStoresToPredicate(
321     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
322     cl::desc("Max number of stores to be predicated behind an if."));
323 
324 static cl::opt<bool> EnableIndVarRegisterHeur(
325     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
326     cl::desc("Count the induction variable only once when interleaving"));
327 
328 static cl::opt<bool> EnableCondStoresVectorization(
329     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
330     cl::desc("Enable if predication of stores during vectorization."));
331 
332 static cl::opt<unsigned> MaxNestedScalarReductionIC(
333     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
334     cl::desc("The maximum interleave count to use when interleaving a scalar "
335              "reduction in a nested loop."));
336 
337 static cl::opt<bool>
338     PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
339                            cl::Hidden,
340                            cl::desc("Prefer in-loop vector reductions, "
341                                     "overriding the target's preference."));
342 
343 static cl::opt<bool> ForceOrderedReductions(
344     "force-ordered-reductions", cl::init(false), cl::Hidden,
345     cl::desc("Enable the vectorization of loops with in-order (strict) "
346              "FP reductions"));
347 
348 static cl::opt<bool> PreferPredicatedReductionSelect(
349     "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
350     cl::desc(
351         "Prefer predicating a reduction operation over an after loop select."));
352 
353 namespace llvm {
354 cl::opt<bool> EnableVPlanNativePath(
355     "enable-vplan-native-path", cl::Hidden,
356     cl::desc("Enable VPlan-native vectorization path with "
357              "support for outer loop vectorization."));
358 } // namespace llvm
359 
360 // This flag enables the stress testing of the VPlan H-CFG construction in the
361 // VPlan-native vectorization path. It must be used in conjunction with
362 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
363 // verification of the H-CFGs built.
364 static cl::opt<bool> VPlanBuildStressTest(
365     "vplan-build-stress-test", cl::init(false), cl::Hidden,
366     cl::desc(
367         "Build VPlan for every supported loop nest in the function and bail "
368         "out right after the build (stress test the VPlan H-CFG construction "
369         "in the VPlan-native vectorization path)."));
370 
371 cl::opt<bool> llvm::EnableLoopInterleaving(
372     "interleave-loops", cl::init(true), cl::Hidden,
373     cl::desc("Enable loop interleaving in Loop vectorization passes"));
374 cl::opt<bool> llvm::EnableLoopVectorization(
375     "vectorize-loops", cl::init(true), cl::Hidden,
376     cl::desc("Run the Loop vectorization passes"));
377 
378 static cl::opt<cl::boolOrDefault> ForceSafeDivisor(
379     "force-widen-divrem-via-safe-divisor", cl::Hidden,
380     cl::desc(
381         "Override cost based safe divisor widening for div/rem instructions"));
382 
383 static cl::opt<bool> UseWiderVFIfCallVariantsPresent(
384     "vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true),
385     cl::Hidden,
386     cl::desc("Try wider VFs if they enable the use of vector variants"));
387 
388 static cl::opt<bool> EnableEarlyExitVectorization(
389     "enable-early-exit-vectorization", cl::init(false), cl::Hidden,
390     cl::desc(
391         "Enable vectorization of early exit loops with uncountable exits."));
392 
393 // Likelihood of bypassing the vectorized loop because assumptions about SCEV
394 // variables not overflowing do not hold. See `emitSCEVChecks`.
395 static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127};
396 // Likelihood of bypassing the vectorized loop because pointers overlap. See
397 // `emitMemRuntimeChecks`.
398 static constexpr uint32_t MemCheckBypassWeights[] = {1, 127};
399 // Likelihood of bypassing the vectorized loop because there are zero trips left
400 // after prolog. See `emitIterationCountCheck`.
401 static constexpr uint32_t MinItersBypassWeights[] = {1, 127};
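
// Note (an editorial sketch of intent, not a guarantee about the exact metadata
// emitted): each pair above is used as branch_weights on the corresponding
// bypass branch, so {1, 127} marks taking the bypass as the unlikely outcome,
// roughly once in every 128 executions.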
402 
403 /// A helper function that returns true if the given type is irregular. The
404 /// type is irregular if its allocated size doesn't equal the store size of an
405 /// element of the corresponding vector type.
406 static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
407   // Determine if an array of N elements of type Ty is "bitcast compatible"
408   // with a <N x Ty> vector.
409   // This is only true if there is no padding between the array elements.
410   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
411 }
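
// For example, with a typical X86 data layout, x86_fp80 has a type size of 80
// bits but an alloc size of 96 or 128 bits, so arrays of it contain padding and
// it is treated as irregular, whereas i32 (32-bit size and alloc size) is not.
// (Illustrative only; the exact sizes depend on the DataLayout in use.)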
412 
413 /// Returns "best known" trip count for the specified loop \p L as defined by
414 /// the following procedure:
415 ///   1) Returns exact trip count if it is known.
416 ///   2) Returns expected trip count according to profile data if any.
417 ///   3) Returns upper bound estimate if known, and if \p CanUseConstantMax.
418 ///   4) Returns std::nullopt if all of the above failed.
419 static std::optional<unsigned>
420 getSmallBestKnownTC(PredicatedScalarEvolution &PSE, Loop *L,
421                     bool CanUseConstantMax = true) {
422   // Check if exact trip count is known.
423   if (unsigned ExpectedTC = PSE.getSE()->getSmallConstantTripCount(L))
424     return ExpectedTC;
425 
426   // Check if there is an expected trip count available from profile data.
427   if (LoopVectorizeWithBlockFrequency)
428     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
429       return *EstimatedTC;
430 
431   if (!CanUseConstantMax)
432     return std::nullopt;
433 
434   // Check if upper bound estimate is known.
435   if (unsigned ExpectedTC = PSE.getSmallConstantMaxTripCount())
436     return ExpectedTC;
437 
438   return std::nullopt;
439 }
440 
441 namespace {
442 // Forward declare GeneratedRTChecks.
443 class GeneratedRTChecks;
444 
445 using SCEV2ValueTy = DenseMap<const SCEV *, Value *>;
446 } // namespace
447 
448 namespace llvm {
449 
450 AnalysisKey ShouldRunExtraVectorPasses::Key;
451 
452 /// InnerLoopVectorizer vectorizes loops which contain only one basic
453 /// block to a specified vectorization factor (VF).
454 /// This class performs the widening of scalars into vectors, or multiple
455 /// scalars. This class also implements the following features:
456 /// * It inserts an epilogue loop for handling loops that don't have iteration
457 ///   counts that are known to be a multiple of the vectorization factor.
458 /// * It handles the code generation for reduction variables.
459 /// * Scalarization (implementation using scalars) of un-vectorizable
460 ///   instructions.
461 /// InnerLoopVectorizer does not perform any vectorization-legality
462 /// checks, and relies on the caller to check for the different legality
463 /// aspects. The InnerLoopVectorizer relies on the
464 /// LoopVectorizationLegality class to provide information about the induction
465 /// and reduction variables that were found for a given vectorization factor.
466 class InnerLoopVectorizer {
467 public:
468   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
469                       LoopInfo *LI, DominatorTree *DT,
470                       const TargetLibraryInfo *TLI,
471                       const TargetTransformInfo *TTI, AssumptionCache *AC,
472                       OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
473                       ElementCount MinProfitableTripCount,
474                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
475                       LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
476                       ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks,
477                       VPlan &Plan)
478       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
479         AC(AC), ORE(ORE), VF(VecWidth),
480         MinProfitableTripCount(MinProfitableTripCount), UF(UnrollFactor),
481         Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
482         PSI(PSI), RTChecks(RTChecks), Plan(Plan),
483         VectorPHVPB(Plan.getEntry()->getSingleSuccessor()) {
484     // Query this against the original loop and save it here because the profile
485     // of the original loop header may change as the transformation happens.
486     OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
487         OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
488   }
489 
490   virtual ~InnerLoopVectorizer() = default;
491 
492   /// Create a new empty loop that will contain vectorized instructions later
493   /// on, while the old loop will be used as the scalar remainder. Control flow
494   /// is generated around the vectorized (and scalar epilogue) loops consisting
495   /// of various checks and bypasses. Return the pre-header block of the new
496 /// loop. In the case of epilogue vectorization, this function is overridden to
497   /// handle the more complex control flow around the loops. \p ExpandedSCEVs is
498   /// used to look up SCEV expansions for expressions needed during skeleton
499   /// creation.
500   virtual BasicBlock *
501   createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs);
502 
503   /// Fix the vectorized code, taking care of header phi's, and more.
504   void fixVectorizedLoop(VPTransformState &State);
505 
506   // Return true if any runtime check is added.
507   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
508 
509   /// A helper function to scalarize a single Instruction in the innermost loop.
510   /// Generates a scalar instance of \p Instr for vector lane \p Lane, using the
511   /// VPValue operands from \p RepRecipe instead of \p Instr's operands.
514   void scalarizeInstruction(const Instruction *Instr,
515                             VPReplicateRecipe *RepRecipe, const VPLane &Lane,
516                             VPTransformState &State);
517 
518   /// Fix the non-induction PHIs in \p Plan.
519   void fixNonInductionPHIs(VPTransformState &State);
520 
521   /// Returns the original loop trip count.
522   Value *getTripCount() const { return TripCount; }
523 
524   /// Used to set the trip count after ILV's construction and after the
525   /// preheader block has been executed. Note that this always holds the trip
526   /// count of the original loop for both main loop and epilogue vectorization.
527   void setTripCount(Value *TC) { TripCount = TC; }
528 
529   /// Retrieve the additional bypass value associated with an original
530   /// induction header phi.
531   Value *getInductionAdditionalBypassValue(PHINode *OrigPhi) const {
532     return Induction2AdditionalBypassValue.at(OrigPhi);
533   }
534 
535   /// Return the additional bypass block which targets the scalar loop by
536   /// skipping the epilogue loop after completing the main loop.
537   BasicBlock *getAdditionalBypassBlock() const {
538     assert(AdditionalBypassBlock &&
539            "Trying to access AdditionalBypassBlock but it has not been set");
540     return AdditionalBypassBlock;
541   }
542 
543 protected:
544   friend class LoopVectorizationPlanner;
545 
546   /// Iteratively sink the scalarized operands of a predicated instruction into
547   /// the block that was created for it.
548   void sinkScalarOperands(Instruction *PredInst);
549 
550   /// Returns (and creates if needed) the trip count of the widened loop.
551   Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);
552 
553   /// Emit a bypass check to see if the vector trip count is zero, including if
554   /// it overflows.
555   void emitIterationCountCheck(BasicBlock *Bypass);
556 
557   /// Emit a bypass check to see if all of the SCEV assumptions we've
558   /// had to make are correct. Returns the block containing the checks or
559   /// nullptr if no checks have been added.
560   BasicBlock *emitSCEVChecks(BasicBlock *Bypass);
561 
562   /// Emit bypass checks to check any memory assumptions we may have made.
563   /// Returns the block containing the checks or nullptr if no checks have been
564   /// added.
565   BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass);
566 
567   /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
568   /// vector loop preheader, middle block and scalar preheader.
569   void createVectorLoopSkeleton(StringRef Prefix);
570 
571   /// Create and record the resume values for the induction variables, coming
572   /// from the additional bypass block.
573   void createInductionAdditionalBypassValues(const SCEV2ValueTy &ExpandedSCEVs,
574                                              Value *MainVectorTripCount);
575 
576   /// Allow subclasses to override and print debug traces before/after vplan
577   /// execution, when trace information is requested.
578   virtual void printDebugTracesAtStart() {}
579   virtual void printDebugTracesAtEnd() {}
580 
581   /// Introduces a new VPIRBasicBlock for \p CheckIRBB to Plan between the
582   /// vector preheader and its predecessor, also connecting the new block to the
583   /// scalar preheader.
584   void introduceCheckBlockInVPlan(BasicBlock *CheckIRBB);
585 
586   /// The original loop.
587   Loop *OrigLoop;
588 
589   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
590   /// dynamic knowledge to simplify SCEV expressions and converts them to a
591   /// more usable form.
592   PredicatedScalarEvolution &PSE;
593 
594   /// Loop Info.
595   LoopInfo *LI;
596 
597   /// Dominator Tree.
598   DominatorTree *DT;
599 
600   /// Target Library Info.
601   const TargetLibraryInfo *TLI;
602 
603   /// Target Transform Info.
604   const TargetTransformInfo *TTI;
605 
606   /// Assumption Cache.
607   AssumptionCache *AC;
608 
609   /// Interface to emit optimization remarks.
610   OptimizationRemarkEmitter *ORE;
611 
612   /// The vectorization SIMD factor to use. Each vector will have this many
613   /// vector elements.
614   ElementCount VF;
615 
616   ElementCount MinProfitableTripCount;
617 
618   /// The vectorization unroll factor to use. Each scalar is vectorized to this
619   /// many different vector instructions.
620   unsigned UF;
621 
622   /// The builder that we use
623   IRBuilder<> Builder;
624 
625   // --- Vectorization state ---
626 
627   /// The vector-loop preheader.
628   BasicBlock *LoopVectorPreHeader;
629 
630   /// The scalar-loop preheader.
631   BasicBlock *LoopScalarPreHeader;
632 
633   /// Middle Block between the vector and the scalar.
634   BasicBlock *LoopMiddleBlock;
635 
636   /// A list of all bypass blocks. The first block is the entry of the loop.
637   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
638 
639   /// Store instructions that were predicated.
640   SmallVector<Instruction *, 4> PredicatedInstructions;
641 
642   /// Trip count of the original loop.
643   Value *TripCount = nullptr;
644 
645   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
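  /// (As a purely illustrative example: with TripCount = 103, VF = 8 and
  /// UF = 2, the widened loop covers 96 iterations and the remaining 7 run in
  /// the scalar remainder loop.)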
646   Value *VectorTripCount = nullptr;
647 
648   /// The legality analysis.
649   LoopVectorizationLegality *Legal;
650 
651   /// The profitability analysis.
652   LoopVectorizationCostModel *Cost;
653 
654   // Record whether runtime checks are added.
655   bool AddedSafetyChecks = false;
656 
657   /// BFI and PSI are used to check for profile guided size optimizations.
658   BlockFrequencyInfo *BFI;
659   ProfileSummaryInfo *PSI;
660 
661   // Whether this loop should be optimized for size based on profile guided size
662   // optimizations.
663   bool OptForSizeBasedOnProfile;
664 
665   /// Structure to hold information about generated runtime checks, responsible
666   /// for cleaning the checks, if vectorization turns out unprofitable.
667   GeneratedRTChecks &RTChecks;
668 
669   /// Mapping of induction phis to their additional bypass values. They
670   /// need to be added as operands to phi nodes in the scalar loop preheader
671   /// after the epilogue skeleton has been created.
672   DenseMap<PHINode *, Value *> Induction2AdditionalBypassValue;
673 
674   /// The additional bypass block which conditionally skips over the epilogue
675   /// loop after executing the main loop. Needed to resume inductions and
676   /// reductions during epilogue vectorization.
677   BasicBlock *AdditionalBypassBlock = nullptr;
678 
679   VPlan &Plan;
680 
681   /// The vector preheader block of \p Plan, used as target for check blocks
682   /// introduced during skeleton creation.
683   VPBlockBase *VectorPHVPB;
684 };
685 
686 /// Encapsulate information regarding vectorization of a loop and its epilogue.
687 /// This information is meant to be updated and used across two stages of
688 /// epilogue vectorization.
689 struct EpilogueLoopVectorizationInfo {
690   ElementCount MainLoopVF = ElementCount::getFixed(0);
691   unsigned MainLoopUF = 0;
692   ElementCount EpilogueVF = ElementCount::getFixed(0);
693   unsigned EpilogueUF = 0;
694   BasicBlock *MainLoopIterationCountCheck = nullptr;
695   BasicBlock *EpilogueIterationCountCheck = nullptr;
696   BasicBlock *SCEVSafetyCheck = nullptr;
697   BasicBlock *MemSafetyCheck = nullptr;
698   Value *TripCount = nullptr;
699   Value *VectorTripCount = nullptr;
700   VPlan &EpiloguePlan;
701 
702   EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
703                                 ElementCount EVF, unsigned EUF,
704                                 VPlan &EpiloguePlan)
705       : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF),
706         EpiloguePlan(EpiloguePlan) {
707     assert(EUF == 1 &&
708            "A high UF for the epilogue loop is likely not beneficial.");
709   }
710 };
711 
712 /// An extension of the inner loop vectorizer that creates a skeleton for a
713 /// vectorized loop that has its epilogue (residual) also vectorized.
714 /// The idea is to run the vplan on a given loop twice: first to set up the
715 /// skeleton and vectorize the main loop, and second to complete the skeleton
716 /// from the first step and vectorize the epilogue.  This is achieved by
717 /// deriving two concrete strategy classes from this base class and invoking
718 /// them in succession from the loop vectorizer planner.
719 class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
720 public:
721   InnerLoopAndEpilogueVectorizer(
722       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
723       DominatorTree *DT, const TargetLibraryInfo *TLI,
724       const TargetTransformInfo *TTI, AssumptionCache *AC,
725       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
726       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
727       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
728       GeneratedRTChecks &Checks, VPlan &Plan)
729       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
730                             EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
731                             CM, BFI, PSI, Checks, Plan),
732         EPI(EPI) {}
733 
734   // Override this function to handle the more complex control flow around the
735   // three loops.
736   BasicBlock *
737   createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final {
738     return createEpilogueVectorizedLoopSkeleton(ExpandedSCEVs);
739   }
740 
741   /// The interface for creating a vectorized skeleton using one of two
742   /// different strategies, each corresponding to one execution of the vplan
743   /// as described above.
744   virtual BasicBlock *
745   createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) = 0;
746 
747   /// Holds and updates state information required to vectorize the main loop
748   /// and its epilogue in two separate passes. This setup helps us avoid
749   /// regenerating and recomputing runtime safety checks. It also helps us to
750   /// shorten the iteration-count-check path length for the cases where the
751   /// iteration count of the loop is so small that the main vector loop is
752   /// completely skipped.
753   EpilogueLoopVectorizationInfo &EPI;
754 };
755 
756 /// A specialized derived class of inner loop vectorizer that performs
757 /// vectorization of *main* loops in the process of vectorizing loops and their
758 /// epilogues.
759 class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
760 public:
761   EpilogueVectorizerMainLoop(
762       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
763       DominatorTree *DT, const TargetLibraryInfo *TLI,
764       const TargetTransformInfo *TTI, AssumptionCache *AC,
765       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
766       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
767       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
768       GeneratedRTChecks &Check, VPlan &Plan)
769       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
770                                        EPI, LVL, CM, BFI, PSI, Check, Plan) {}
771   /// Implements the interface for creating a vectorized skeleton using the
772   /// *main loop* strategy (i.e. the first pass of vplan execution).
773   BasicBlock *
774   createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
775 
776 protected:
777   /// Emits an iteration count bypass check once for the main loop (when \p
778   /// ForEpilogue is false) and once for the epilogue loop (when \p
779   /// ForEpilogue is true).
780   BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
781   void printDebugTracesAtStart() override;
782   void printDebugTracesAtEnd() override;
783 };
784 
785 // A specialized derived class of inner loop vectorizer that performs
786 // vectorization of *epilogue* loops in the process of vectorizing loops and
787 // their epilogues.
788 class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
789 public:
790   EpilogueVectorizerEpilogueLoop(
791       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
792       DominatorTree *DT, const TargetLibraryInfo *TLI,
793       const TargetTransformInfo *TTI, AssumptionCache *AC,
794       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
795       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
796       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
797       GeneratedRTChecks &Checks, VPlan &Plan)
798       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
799                                        EPI, LVL, CM, BFI, PSI, Checks, Plan) {
800     TripCount = EPI.TripCount;
801   }
802   /// Implements the interface for creating a vectorized skeleton using the
803   /// *epilogue loop* strategy (i.e. the second pass of vplan execution).
804   BasicBlock *
805   createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
806 
807 protected:
808   /// Emits an iteration count bypass check after the main vector loop has
809   /// finished to see if there are any iterations left to execute by either
810   /// the vector epilogue or the scalar epilogue.
811   BasicBlock *emitMinimumVectorEpilogueIterCountCheck(BasicBlock *Bypass,
812                                                       BasicBlock *Insert);
814   void printDebugTracesAtStart() override;
815   void printDebugTracesAtEnd() override;
816 };
817 } // end namespace llvm
818 
819 /// Look for a meaningful debug location on the instruction or its operands.
820 static DebugLoc getDebugLocFromInstOrOperands(Instruction *I) {
821   if (!I)
822     return DebugLoc();
823 
824   DebugLoc Empty;
825   if (I->getDebugLoc() != Empty)
826     return I->getDebugLoc();
827 
828   for (Use &Op : I->operands()) {
829     if (Instruction *OpInst = dyn_cast<Instruction>(Op))
830       if (OpInst->getDebugLoc() != Empty)
831         return OpInst->getDebugLoc();
832   }
833 
834   return I->getDebugLoc();
835 }
836 
837 /// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
838 /// is passed, the message relates to that particular instruction.
839 #ifndef NDEBUG
840 static void debugVectorizationMessage(const StringRef Prefix,
841                                       const StringRef DebugMsg,
842                                       Instruction *I) {
843   dbgs() << "LV: " << Prefix << DebugMsg;
844   if (I != nullptr)
845     dbgs() << " " << *I;
846   else
847     dbgs() << '.';
848   dbgs() << '\n';
849 }
850 #endif
851 
852 /// Create an analysis remark that explains why vectorization failed
853 ///
854 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
855 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
856 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
857 /// the location of the remark. If \p DL is passed, use it as debug location for
858 /// the remark. \return the remark object that can be streamed to.
859 static OptimizationRemarkAnalysis
860 createLVAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop,
861                  Instruction *I, DebugLoc DL = {}) {
862   Value *CodeRegion = I ? I->getParent() : TheLoop->getHeader();
863   // If debug location is attached to the instruction, use it. Otherwise if DL
864   // was not provided, use the loop's.
865   if (I && I->getDebugLoc())
866     DL = I->getDebugLoc();
867   else if (!DL)
868     DL = TheLoop->getStartLoc();
869 
870   return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
871 }
872 
873 namespace llvm {
874 
875 /// Return a value for Step multiplied by VF.
876 Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
877                        int64_t Step) {
878   assert(Ty->isIntegerTy() && "Expected an integer step");
879   return B.CreateElementCount(Ty, VF.multiplyCoefficientBy(Step));
880 }
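
// As an illustrative note: for a fixed VF of 4 and Step = 2 this simply folds
// to the constant 8, while for a scalable VF of (vscale x 4) it materializes
// 8 * vscale (via the llvm.vscale intrinsic) at runtime.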
881 
882 /// Return the runtime value for VF.
883 Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
884   return B.CreateElementCount(Ty, VF);
885 }
886 
887 void reportVectorizationFailure(const StringRef DebugMsg,
888                                 const StringRef OREMsg, const StringRef ORETag,
889                                 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
890                                 Instruction *I) {
891   LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
892   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
893   ORE->emit(
894       createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
895       << "loop not vectorized: " << OREMsg);
896 }
897 
898 /// Reports an informative message: print \p Msg for debugging purposes as well
899 /// as an optimization remark. Uses either \p I as location of the remark, or
900 /// otherwise \p TheLoop. If \p DL is passed, use it as debug location for the
901 /// remark.
902 static void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
903                                     OptimizationRemarkEmitter *ORE,
904                                     Loop *TheLoop, Instruction *I = nullptr,
905                                     DebugLoc DL = {}) {
906   LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
907   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
908   ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop,
909                              I, DL)
910             << Msg);
911 }
912 
913 /// Report successful vectorization of the loop. In case an outer loop is
914 /// vectorized, prepend "outer" to the vectorization remark.
915 static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop,
916                                 VectorizationFactor VF, unsigned IC) {
917   LLVM_DEBUG(debugVectorizationMessage(
918       "Vectorizing: ", TheLoop->isInnermost() ? "innermost loop" : "outer loop",
919       nullptr));
920   StringRef LoopType = TheLoop->isInnermost() ? "" : "outer ";
921   ORE->emit([&]() {
922     return OptimizationRemark(LV_NAME, "Vectorized", TheLoop->getStartLoc(),
923                               TheLoop->getHeader())
924            << "vectorized " << LoopType << "loop (vectorization width: "
925            << ore::NV("VectorizationFactor", VF.Width)
926            << ", interleaved count: " << ore::NV("InterleaveCount", IC) << ")";
927   });
928 }
929 
930 } // end namespace llvm
931 
932 namespace llvm {
933 
934 // Hints for the loop vectorization cost model about how the scalar epilogue
935 // loop should be lowered.
936 enum ScalarEpilogueLowering {
937 
938   // The default: allowing scalar epilogues.
939   CM_ScalarEpilogueAllowed,
940 
941   // Vectorization with OptForSize: don't allow epilogues.
942   CM_ScalarEpilogueNotAllowedOptSize,
943 
944   // A special case of vectorization with OptForSize: loops with a very small
945   // trip count are considered for vectorization under OptForSize, thereby
946   // making sure the cost of their loop body is dominant, free of runtime
947   // guards and scalar iteration overheads.
948   CM_ScalarEpilogueNotAllowedLowTripLoop,
949 
950   // Loop hint predicate indicating an epilogue is undesired.
951   CM_ScalarEpilogueNotNeededUsePredicate,
952 
953   // Directive indicating we must either tail fold or not vectorize
954   CM_ScalarEpilogueNotAllowedUsePredicate
955 };
956 
957 using InstructionVFPair = std::pair<Instruction *, ElementCount>;
958 
959 /// LoopVectorizationCostModel - estimates the expected speedups due to
960 /// vectorization.
961 /// In many cases vectorization is not profitable. This can happen for a
962 /// number of reasons. In this class we mainly attempt to predict the
963 /// expected speedup/slowdowns due to the supported instruction set. We use the
964 /// TargetTransformInfo to query the different backends for the cost of
965 /// different operations.
966 class LoopVectorizationCostModel {
967   friend class LoopVectorizationPlanner;
968 
969 public:
970   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
971                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
972                              LoopVectorizationLegality *Legal,
973                              const TargetTransformInfo &TTI,
974                              const TargetLibraryInfo *TLI, DemandedBits *DB,
975                              AssumptionCache *AC,
976                              OptimizationRemarkEmitter *ORE, const Function *F,
977                              const LoopVectorizeHints *Hints,
978                              InterleavedAccessInfo &IAI)
979       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
980         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
981         Hints(Hints), InterleaveInfo(IAI), CostKind(TTI::TCK_RecipThroughput) {}
982 
983   /// \return An upper bound for the vectorization factors (both fixed and
984   /// scalable). If the factors are 0, vectorization and interleaving should be
985   /// avoided up front.
986   FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
987 
988   /// \return True if runtime checks are required for vectorization, and false
989   /// otherwise.
990   bool runtimeChecksRequired();
991 
992   /// Setup cost-based decisions for user vectorization factor.
993   /// \return true if the UserVF is a feasible VF to be chosen.
994   bool selectUserVectorizationFactor(ElementCount UserVF) {
995     collectUniformsAndScalars(UserVF);
996     collectInstsToScalarize(UserVF);
997     return expectedCost(UserVF).isValid();
998   }
999 
1000   /// \return The size (in bits) of the smallest and widest types in the code
1001   /// that needs to be vectorized. We ignore values that remain scalar such as
1002   /// 64 bit loop indices.
1003   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1004 
1005   /// \return The desired interleave count.
1006   /// If interleave count has been specified by metadata it will be returned.
1007   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1008   /// are the selected vectorization factor and the cost of the selected VF.
1009   unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost);
1010 
1011   /// Memory access instruction may be vectorized in more than one way.
1012   /// Form of instruction after vectorization depends on cost.
1013   /// This function takes cost-based decisions for Load/Store instructions
1014   /// and collects them in a map. This decisions map is used for building
1015   /// the lists of loop-uniform and loop-scalar instructions.
1016   /// The calculated cost is saved with widening decision in order to
1017   /// avoid redundant calculations.
1018   void setCostBasedWideningDecision(ElementCount VF);
1019 
1020   /// A call may be vectorized in different ways depending on whether we have
1021   /// vectorized variants available and whether the target supports masking.
1022   /// This function analyzes all calls in the function at the supplied VF,
1023   /// makes a decision based on the costs of available options, and stores that
1024   /// decision in a map for use in planning and plan execution.
1025   void setVectorizedCallDecision(ElementCount VF);
1026 
1027   /// A struct that represents some properties of the register usage
1028   /// of a loop.
1029   struct RegisterUsage {
1030     /// Holds the number of loop invariant values that are used in the loop.
1031     /// The key is ClassID of target-provided register class.
1032     SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1033     /// Holds the maximum number of concurrent live intervals in the loop.
1034     /// The key is ClassID of target-provided register class.
1035     SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1036   };
1037 
1038   /// \return Returns information about the register usages of the loop for the
1039   /// given vectorization factors.
1040   SmallVector<RegisterUsage, 8>
1041   calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1042 
1043   /// Collect values we want to ignore in the cost model.
1044   void collectValuesToIgnore();
1045 
1046   /// Collect all element types in the loop for which widening is needed.
1047   void collectElementTypesForWidening();
1048 
1049   /// Split reductions into those that happen in the loop, and those that happen
1050   /// outside. In-loop reductions are collected into InLoopReductions.
1051   void collectInLoopReductions();
1052 
1053   /// Returns true if we should use strict in-order reductions for the given
1054   /// RdxDesc. This is true if the IsOrdered flag of RdxDesc is set and we do
1055   /// not allow reordering of FP operations.
1057   bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
1058     return !Hints->allowReordering() && RdxDesc.isOrdered();
1059   }
1060 
1061   /// \returns The smallest bitwidth each instruction can be represented with.
1062   /// The vector equivalents of these instructions should be truncated to this
1063   /// type.
1064   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1065     return MinBWs;
1066   }
1067 
1068   /// \returns True if it is more profitable to scalarize instruction \p I for
1069   /// vectorization factor \p VF.
1070   bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1071     assert(VF.isVector() &&
1072            "Profitable to scalarize relevant only for VF > 1.");
1073     assert(
1074         TheLoop->isInnermost() &&
1075         "cost-model should not be used for outer loops (in VPlan-native path)");
1076 
1077     auto Scalars = InstsToScalarize.find(VF);
1078     assert(Scalars != InstsToScalarize.end() &&
1079            "VF not yet analyzed for scalarization profitability");
1080     return Scalars->second.contains(I);
1081   }
1082 
1083   /// Returns true if \p I is known to be uniform after vectorization.
1084   bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1085     assert(
1086         TheLoop->isInnermost() &&
1087         "cost-model should not be used for outer loops (in VPlan-native path)");
1088     // Pseudo probe needs to be duplicated for each unrolled iteration and
1089     // vector lane so that profiled loop trip count can be accurately
1090     // accumulated instead of being undercounted.
1091     if (isa<PseudoProbeInst>(I))
1092       return false;
1093 
1094     if (VF.isScalar())
1095       return true;
1096 
1097     auto UniformsPerVF = Uniforms.find(VF);
1098     assert(UniformsPerVF != Uniforms.end() &&
1099            "VF not yet analyzed for uniformity");
1100     return UniformsPerVF->second.count(I);
1101   }
1102 
1103   /// Returns true if \p I is known to be scalar after vectorization.
1104   bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1105     assert(
1106         TheLoop->isInnermost() &&
1107         "cost-model should not be used for outer loops (in VPlan-native path)");
1108     if (VF.isScalar())
1109       return true;
1110 
1111     auto ScalarsPerVF = Scalars.find(VF);
1112     assert(ScalarsPerVF != Scalars.end() &&
1113            "Scalar values are not calculated for VF");
1114     return ScalarsPerVF->second.count(I);
1115   }
1116 
1117   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1118   /// for vectorization factor \p VF.
1119   bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1120     return VF.isVector() && MinBWs.contains(I) &&
1121            !isProfitableToScalarize(I, VF) &&
1122            !isScalarAfterVectorization(I, VF);
1123   }
1124 
1125   /// Decision that was taken during cost calculation for memory instruction.
1126   enum InstWidening {
1127     CM_Unknown,
1128     CM_Widen,         // For consecutive accesses with stride +1.
1129     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1130     CM_Interleave,
1131     CM_GatherScatter,
1132     CM_Scalarize,
1133     CM_VectorCall,
1134     CM_IntrinsicCall
1135   };
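
  // A rough illustration of how these map onto memory accesses (the actual
  // choice is cost-based and target-dependent): a load of a[i] with stride +1
  // is typically CM_Widen, a[n - i] is CM_Widen_Reverse, members of a group
  // such as {a[2*i], a[2*i+1]} may form CM_Interleave, an indexed access
  // a[idx[i]] may become CM_GatherScatter, and anything else may be
  // CM_Scalarize.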
1136 
1137   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1138   /// instruction \p I and vector width \p VF.
1139   void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1140                            InstructionCost Cost) {
1141     assert(VF.isVector() && "Expected VF >=2");
1142     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1143   }
1144 
1145   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1146   /// interleaving group \p Grp and vector width \p VF.
1147   void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1148                            ElementCount VF, InstWidening W,
1149                            InstructionCost Cost) {
1150     assert(VF.isVector() && "Expected VF >=2");
1151     /// Broadcast this decision to all instructions inside the group.
1152     /// When interleaving, the cost will only be assigned to one instruction,
1153     /// the insert position. For other cases, add the appropriate fraction of
1154     /// the total cost to each instruction. This ensures accurate costs are
1155     /// used, even if the insert position instruction is not used.
1156     InstructionCost InsertPosCost = Cost;
1157     InstructionCost OtherMemberCost = 0;
1158     if (W != CM_Interleave)
1159       OtherMemberCost = InsertPosCost = Cost / Grp->getNumMembers();
1161     for (unsigned Idx = 0; Idx < Grp->getFactor(); ++Idx) {
1162       if (auto *I = Grp->getMember(Idx)) {
1163         if (Grp->getInsertPos() == I)
1164           WideningDecisions[std::make_pair(I, VF)] =
1165               std::make_pair(W, InsertPosCost);
1166         else
1167           WideningDecisions[std::make_pair(I, VF)] =
1168               std::make_pair(W, OtherMemberCost);
1169       }
1170     }
1171   }
1172 
1173   /// Return the cost model decision for the given instruction \p I and vector
1174   /// width \p VF. Return CM_Unknown if this instruction did not pass
1175   /// through the cost modeling.
1176   InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1177     assert(VF.isVector() && "Expected VF to be a vector VF");
1178     assert(
1179         TheLoop->isInnermost() &&
1180         "cost-model should not be used for outer loops (in VPlan-native path)");
1181 
1182     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1183     auto Itr = WideningDecisions.find(InstOnVF);
1184     if (Itr == WideningDecisions.end())
1185       return CM_Unknown;
1186     return Itr->second.first;
1187   }
1188 
1189   /// Return the vectorization cost for the given instruction \p I and vector
1190   /// width \p VF.
1191   InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1192     assert(VF.isVector() && "Expected VF >=2");
1193     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1194     assert(WideningDecisions.contains(InstOnVF) &&
1195            "The cost is not calculated");
1196     return WideningDecisions[InstOnVF].second;
1197   }
1198 
1199   struct CallWideningDecision {
1200     InstWidening Kind;
1201     Function *Variant;
1202     Intrinsic::ID IID;
1203     std::optional<unsigned> MaskPos;
1204     InstructionCost Cost;
1205   };
1206 
1207   void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind,
1208                                Function *Variant, Intrinsic::ID IID,
1209                                std::optional<unsigned> MaskPos,
1210                                InstructionCost Cost) {
1211     assert(!VF.isScalar() && "Expected vector VF");
1212     CallWideningDecisions[std::make_pair(CI, VF)] = {Kind, Variant, IID,
1213                                                      MaskPos, Cost};
1214   }
1215 
1216   CallWideningDecision getCallWideningDecision(CallInst *CI,
1217                                                ElementCount VF) const {
1218     assert(!VF.isScalar() && "Expected vector VF");
1219     return CallWideningDecisions.at(std::make_pair(CI, VF));
1220   }
1221 
1222   /// Return True if instruction \p I is an optimizable truncate whose operand
1223   /// is an induction variable. Such a truncate will be removed by adding a new
1224   /// induction variable with the destination type.
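       /// For example (illustrative), a 'trunc i64 %iv to i32' of an i64
       /// induction %iv can be removed by creating a new i32 induction that
       /// starts and steps with the truncated values.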
1225   bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1226     // If the instruction is not a truncate, return false.
1227     auto *Trunc = dyn_cast<TruncInst>(I);
1228     if (!Trunc)
1229       return false;
1230 
1231     // Get the source and destination types of the truncate.
1232     Type *SrcTy = toVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1233     Type *DestTy = toVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1234 
1235     // If the truncate is free for the given types, return false. Replacing a
1236     // free truncate with an induction variable would add an induction variable
1237     // update instruction to each iteration of the loop. We exclude from this
1238     // check the primary induction variable since it will need an update
1239     // instruction regardless.
1240     Value *Op = Trunc->getOperand(0);
1241     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1242       return false;
1243 
1244     // If the truncated value is not an induction variable, return false.
1245     return Legal->isInductionPhi(Op);
1246   }
1247 
1248   /// Collects the instructions to scalarize for each predicated instruction in
1249   /// the loop.
1250   void collectInstsToScalarize(ElementCount VF);
1251 
1252   /// Collect Uniform and Scalar values for the given \p VF.
1253   /// The sets depend on CM decision for Load/Store instructions
1254   /// that may be vectorized as interleave, gather-scatter or scalarized.
1255   /// Also make a decision on what to do about call instructions in the loop
1256   /// at that VF -- scalarize, call a known vector routine, or call a
1257   /// vector intrinsic.
1258   void collectUniformsAndScalars(ElementCount VF) {
1259     // Do the analysis once.
1260     if (VF.isScalar() || Uniforms.contains(VF))
1261       return;
1262     setCostBasedWideningDecision(VF);
1263     collectLoopUniforms(VF);
1264     setVectorizedCallDecision(VF);
1265     collectLoopScalars(VF);
1266   }
1267 
1268   /// Returns true if the target machine supports masked store operation
1269   /// for the given \p DataType and kind of access to \p Ptr.
1270   bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1271     return Legal->isConsecutivePtr(DataType, Ptr) &&
1272            TTI.isLegalMaskedStore(DataType, Alignment);
1273   }
1274 
1275   /// Returns true if the target machine supports masked load operation
1276   /// for the given \p DataType and kind of access to \p Ptr.
1277   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1278     return Legal->isConsecutivePtr(DataType, Ptr) &&
1279            TTI.isLegalMaskedLoad(DataType, Alignment);
1280   }
1281 
1282   /// Returns true if the target machine can represent \p V as a masked gather
1283   /// or scatter operation.
1284   bool isLegalGatherOrScatter(Value *V, ElementCount VF) {
1285     bool LI = isa<LoadInst>(V);
1286     bool SI = isa<StoreInst>(V);
1287     if (!LI && !SI)
1288       return false;
1289     auto *Ty = getLoadStoreType(V);
1290     Align Align = getLoadStoreAlignment(V);
1291     if (VF.isVector())
1292       Ty = VectorType::get(Ty, VF);
1293     return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1294            (SI && TTI.isLegalMaskedScatter(Ty, Align));
1295   }
1296 
1297   /// Returns true if the target machine supports all of the reduction
1298   /// variables found for the given VF.
1299   bool canVectorizeReductions(ElementCount VF) const {
1300     return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1301       const RecurrenceDescriptor &RdxDesc = Reduction.second;
1302       return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1303     }));
1304   }
1305 
1306   /// Given costs for both strategies, return true if the scalar predication
1307   /// lowering should be used for div/rem.  This incorporates an override
1308   /// option so it is not simply a cost comparison.
1309   bool isDivRemScalarWithPredication(InstructionCost ScalarCost,
1310                                      InstructionCost SafeDivisorCost) const {
1311     switch (ForceSafeDivisor) {
1312     case cl::BOU_UNSET:
1313       return ScalarCost < SafeDivisorCost;
1314     case cl::BOU_TRUE:
1315       return false;
1316     case cl::BOU_FALSE:
1317       return true;
1318     }
1319     llvm_unreachable("impossible case value");
1320   }
1321 
1322   /// Returns true if \p I is an instruction which requires predication and
1323   /// for which our chosen predication strategy is scalarization (i.e. we
1324   /// don't have an alternate strategy such as masking available).
1325   /// \p VF is the vectorization factor that will be used to vectorize \p I.
1326   bool isScalarWithPredication(Instruction *I, ElementCount VF) const;
1327 
1328   /// Returns true if \p I is an instruction that needs to be predicated
1329   /// at runtime.  The result is independent of the predication mechanism.
1330   /// Superset of instructions that return true for isScalarWithPredication.
1331   bool isPredicatedInst(Instruction *I) const;
1332 
1333   /// Return the costs for our two available strategies for lowering a
1334   /// div/rem operation which requires speculating at least one lane.
1335   /// First result is for scalarization (will be invalid for scalable
1336   /// vectors); second is for the safe-divisor strategy.
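       /// For example (an illustrative sketch), for a predicated 'x / y' the
       /// scalarization strategy executes the division lane by lane under the
       /// predicate, while the safe-divisor strategy substitutes a known-safe
       /// divisor (e.g. 1) into the masked-off lanes and emits one vector
       /// divide.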
1337   std::pair<InstructionCost, InstructionCost>
1338   getDivRemSpeculationCost(Instruction *I,
1339                            ElementCount VF) const;
1340 
1341   /// Returns true if \p I is a memory instruction with consecutive memory
1342   /// access that can be widened.
1343   bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF);
1344 
1345   /// Returns true if \p I is a memory instruction in an interleaved-group
1346   /// of memory accesses that can be vectorized with wide vector loads/stores
1347   /// and shuffles.
1348   bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF) const;
1349 
1350   /// Check if \p Instr belongs to any interleaved access group.
1351   bool isAccessInterleaved(Instruction *Instr) const {
1352     return InterleaveInfo.isInterleaved(Instr);
1353   }
1354 
1355   /// Get the interleaved access group that \p Instr belongs to.
1356   const InterleaveGroup<Instruction> *
1357   getInterleavedAccessGroup(Instruction *Instr) const {
1358     return InterleaveInfo.getInterleaveGroup(Instr);
1359   }
1360 
1361   /// Returns true if we're required to use a scalar epilogue for at least
1362   /// the final iteration of the original loop.
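       /// For example (illustrative), an interleave group with a gap (say only
       /// members {0, 1} of a factor-3 group) cannot safely access the missing
       /// trailing element in the last vector iteration, so the final
       /// iteration(s) must execute in the scalar epilogue.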
1363   bool requiresScalarEpilogue(bool IsVectorizing) const {
1364     if (!isScalarEpilogueAllowed()) {
1365       LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
1366       return false;
1367     }
1368     // If we might exit from anywhere but the latch and early exit vectorization
1369     // is disabled, we must run the exiting iteration in scalar form.
1370     if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch() &&
1371         !(EnableEarlyExitVectorization && Legal->hasUncountableEarlyExit())) {
1372       LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: not exiting "
1373                            "from latch block\n");
1374       return true;
1375     }
1376     if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) {
1377       LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: "
1378                            "interleaved group requires scalar epilogue\n");
1379       return true;
1380     }
1381     LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
1382     return false;
1383   }
1384 
1385   /// Returns true if we're required to use a scalar epilogue for at least
1386   /// the final iteration of the original loop for all VFs in \p Range.
1387   /// A scalar epilogue must either be required for all VFs in \p Range or for
1388   /// none.
1389   bool requiresScalarEpilogue(VFRange Range) const {
1390     auto RequiresScalarEpilogue = [this](ElementCount VF) {
1391       return requiresScalarEpilogue(VF.isVector());
1392     };
1393     bool IsRequired = all_of(Range, RequiresScalarEpilogue);
1394     assert(
1395         (IsRequired || none_of(Range, RequiresScalarEpilogue)) &&
1396         "all VFs in range must agree on whether a scalar epilogue is required");
1397     return IsRequired;
1398   }
1399 
1400   /// Returns true if a scalar epilogue is allowed, i.e. it has not been
1401   /// disabled due to optsize or a loop hint annotation.
1402   bool isScalarEpilogueAllowed() const {
1403     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1404   }
1405 
1406   /// Returns the TailFoldingStyle that is best for the current loop.
1407   TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
1408     if (!ChosenTailFoldingStyle)
1409       return TailFoldingStyle::None;
1410     return IVUpdateMayOverflow ? ChosenTailFoldingStyle->first
1411                                : ChosenTailFoldingStyle->second;
1412   }
1413 
1414   /// Selects and saves the TailFoldingStyle for both cases: when the IV
1415   /// update may overflow and when it does not.
1416   /// \param IsScalableVF true if scalable vector factors are enabled.
1417   /// \param UserIC The user-specified interleave count.
1418   void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC) {
1419     assert(!ChosenTailFoldingStyle && "Tail folding must not be selected yet.");
1420     if (!Legal->canFoldTailByMasking()) {
1421       ChosenTailFoldingStyle =
1422           std::make_pair(TailFoldingStyle::None, TailFoldingStyle::None);
1423       return;
1424     }
1425 
1426     if (!ForceTailFoldingStyle.getNumOccurrences()) {
1427       ChosenTailFoldingStyle = std::make_pair(
1428           TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/true),
1429           TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false));
1430       return;
1431     }
1432 
1433     // Set styles when forced.
1434     ChosenTailFoldingStyle = std::make_pair(ForceTailFoldingStyle.getValue(),
1435                                             ForceTailFoldingStyle.getValue());
1436     if (ForceTailFoldingStyle != TailFoldingStyle::DataWithEVL)
1437       return;
1438     // Override forced styles if needed.
1439     // FIXME: use actual opcode/data type for analysis here.
1440     // FIXME: Investigate opportunity for fixed vector factor.
1441     // FIXME: support fixed-order recurrences by fixing splice of non VFxUF
1442     // penultimate EVL.
1443     bool EVLIsLegal =
1444         UserIC <= 1 && TTI.hasActiveVectorLength(0, nullptr, Align()) &&
1445         !EnableVPlanNativePath && Legal->getFixedOrderRecurrences().empty();
1446     if (!EVLIsLegal) {
1447       // If for some reason EVL mode is unsupported, fall back to
1448       // DataWithoutLaneMask to try to vectorize the loop with folded tail
1449       // in a generic way.
1450       ChosenTailFoldingStyle =
1451           std::make_pair(TailFoldingStyle::DataWithoutLaneMask,
1452                          TailFoldingStyle::DataWithoutLaneMask);
1453       LLVM_DEBUG(
1454           dbgs()
1455           << "LV: Preference for VP intrinsics indicated. Will "
1456              "not try to generate VP Intrinsics "
1457           << (UserIC > 1
1458                   ? "since interleave count specified is greater than 1.\n"
1459                   : "due to non-interleaving reasons.\n"));
1460     }
1461   }
1462 
1463   /// Returns true if all loop blocks should be masked to fold tail loop.
1464   bool foldTailByMasking() const {
1465     // TODO: check if it is possible to check for None style independent of
1466     // IVUpdateMayOverflow flag in getTailFoldingStyle.
1467     return getTailFoldingStyle() != TailFoldingStyle::None;
1468   }
1469 
1470   /// Return maximum safe number of elements to be processed per vector
1471   /// iteration, which do not prevent store-load forwarding and are safe with
1472   /// regard to the memory dependencies. Required for EVL-based VPlans to
1473   /// correctly calculate AVL (application vector length) as min(remaining AVL,
1474   /// MaxSafeElements).
1475   /// TODO: need to consider adjusting cost model to use this value as a
1476   /// vectorization factor for EVL-based vectorization.
1477   std::optional<unsigned> getMaxSafeElements() const { return MaxSafeElements; }
1478 
1479   /// Returns true if the instructions in this block require predication
1480   /// for any reason, e.g. because tail folding now requires a predicate
1481   /// or because the block in the original loop was predicated.
1482   bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
1483     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1484   }
1485 
1486   /// Returns true if VP intrinsics with explicit vector length support should
1487   /// be generated in the tail folded loop.
1488   bool foldTailWithEVL() const {
1489     return getTailFoldingStyle() == TailFoldingStyle::DataWithEVL;
1490   }
1491 
1492   /// Returns true if the Phi is part of an inloop reduction.
1493   bool isInLoopReduction(PHINode *Phi) const {
1494     return InLoopReductions.contains(Phi);
1495   }
1496 
1497   /// Returns true if the predicated reduction select should be used to set the
1498   /// incoming value for the reduction phi.
1499   bool usePredicatedReductionSelect(unsigned Opcode, Type *PhiTy) const {
1500     // Force to use predicated reduction select since the EVL of the
1501     // second-to-last iteration might not be VF*UF.
1502     if (foldTailWithEVL())
1503       return true;
1504     return PreferPredicatedReductionSelect ||
1505            TTI.preferPredicatedReductionSelect(
1506                Opcode, PhiTy, TargetTransformInfo::ReductionFlags());
1507   }
1508 
1509   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1510   /// with factor VF.  Return the cost of the instruction, including
1511   /// scalarization overhead if it's needed.
1512   InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1513 
1514   /// Estimate cost of a call instruction CI if it were vectorized with factor
1515   /// VF. Return the cost of the instruction, including scalarization overhead
1516   /// if it's needed.
1517   InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const;
1518 
1519   /// Invalidates decisions already taken by the cost model.
1520   void invalidateCostModelingDecisions() {
1521     WideningDecisions.clear();
1522     CallWideningDecisions.clear();
1523     Uniforms.clear();
1524     Scalars.clear();
1525   }
1526 
1527   /// Returns the expected execution cost. The unit of the cost does
1528   /// not matter because we use the 'cost' units to compare different
1529   /// vector widths. The cost that is returned is *not* normalized by
1530   /// the factor width.
1531   InstructionCost expectedCost(ElementCount VF);
1532 
1533   bool hasPredStores() const { return NumPredStores > 0; }
1534 
1535   /// Returns true if epilogue vectorization is considered profitable, and
1536   /// false otherwise.
1537   /// \p VF is the vectorization factor chosen for the original loop.
1538   /// \p IC is the interleave count, used as an additional scaling factor
1539   /// applied to VF before comparing to EpilogueVectorizationMinVF.
1540   bool isEpilogueVectorizationProfitable(const ElementCount VF,
1541                                          const unsigned IC) const;
1542 
1543   /// Returns the execution time cost of an instruction for a given vector
1544   /// width. Vector width of one means scalar.
1545   InstructionCost getInstructionCost(Instruction *I, ElementCount VF);
1546 
1547   /// Return the cost of instructions in an inloop reduction pattern, if I is
1548   /// part of that pattern.
1549   std::optional<InstructionCost> getReductionPatternCost(Instruction *I,
1550                                                          ElementCount VF,
1551                                                          Type *VectorTy) const;
1552 
1553   /// Returns true if \p Op should be considered invariant and if it is
1554   /// trivially hoistable.
1555   bool shouldConsiderInvariant(Value *Op);
1556 
1557 private:
1558   unsigned NumPredStores = 0;
1559 
1560   /// \return An upper bound for the vectorization factors for both
1561   /// fixed and scalable vectorization, where the minimum-known number of
1562   /// elements is a power-of-2 larger than zero. If scalable vectorization is
1563   /// disabled or unsupported, then the scalable part will be equal to
1564   /// ElementCount::getScalable(0).
1565   FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount,
1566                                            ElementCount UserVF,
1567                                            bool FoldTailByMasking);
1568 
1569   /// \return the maximized element count based on the target's vector
1570   /// registers and the loop trip-count, but limited to a maximum safe VF.
1571   /// This is a helper function of computeFeasibleMaxVF.
1572   ElementCount getMaximizedVFForTarget(unsigned MaxTripCount,
1573                                        unsigned SmallestType,
1574                                        unsigned WidestType,
1575                                        ElementCount MaxSafeVF,
1576                                        bool FoldTailByMasking);
1577 
1578   /// Checks if scalable vectorization is supported and enabled. Caches the
1579   /// result to avoid repeated debug dumps for repeated queries.
1580   bool isScalableVectorizationAllowed();
1581 
1582   /// \return the maximum legal scalable VF, based on the safe max number
1583   /// of elements.
1584   ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1585 
1586   /// Calculate vectorization cost of memory instruction \p I.
1587   InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1588 
1589   /// The cost computation for scalarized memory instruction.
1590   InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1591 
1592   /// The cost computation for interleaving group of memory instructions.
1593   InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1594 
1595   /// The cost computation for Gather/Scatter instruction.
1596   InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1597 
1598   /// The cost computation for widening instruction \p I with consecutive
1599   /// memory access.
1600   InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1601 
1602   /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1603   /// Load: scalar load + broadcast.
1604   /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1605   /// element)
1606   InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1607 
1608   /// Estimate the overhead of scalarizing an instruction. This is a
1609   /// convenience wrapper for the type-based getScalarizationOverhead API.
1610   InstructionCost getScalarizationOverhead(Instruction *I,
1611                                            ElementCount VF) const;
1612 
1613   /// Returns true if an artificially high cost for emulated masked memrefs
1614   /// should be used.
1615   bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1616 
1617   /// Map of scalar integer values to the smallest bitwidth they can be legally
1618   /// represented as. The vector equivalents of these values should be truncated
1619   /// to this type.
1620   MapVector<Instruction *, uint64_t> MinBWs;
1621 
1622   /// A type representing the costs for instructions if they were to be
1623   /// scalarized rather than vectorized. The entries are Instruction-Cost
1624   /// pairs.
1625   using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1626 
1627   /// Per-VF sets containing all BasicBlocks that are known to be present
1628   /// after vectorization as predicated blocks.
1629   DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>>
1630       PredicatedBBsAfterVectorization;
1631 
1632   /// Records whether it is allowed to have the original scalar loop execute at
1633   /// least once. This may be needed as a fallback loop in case runtime
1634   /// aliasing/dependence checks fail, or to handle the tail/remainder
1635   /// iterations when the trip count is unknown or doesn't divide by the VF,
1636   /// or as a peel-loop to handle gaps in interleave-groups.
1637   /// Under optsize and when the trip count is very small we don't allow any
1638   /// iterations to execute in the scalar loop.
1639   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1640 
1641   /// The tail folding style finally chosen for the loop. The first element is
1642   /// used if the IV update may overflow, the second if it does not.
1643   std::optional<std::pair<TailFoldingStyle, TailFoldingStyle>>
1644       ChosenTailFoldingStyle;
1645 
1646   /// true if scalable vectorization is supported and enabled.
1647   std::optional<bool> IsScalableVectorizationAllowed;
1648 
1649   /// Maximum safe number of elements to be processed per vector iteration,
1650   /// which do not prevent store-load forwarding and are safe with regard to the
1651   /// memory dependencies. Required for EVL-based vectorization, where this
1652   /// value is used as the upper bound of the safe AVL.
1653   std::optional<unsigned> MaxSafeElements;
1654 
1655   /// A map holding scalar costs for different vectorization factors. The
1656   /// presence of a cost for an instruction in the mapping indicates that the
1657   /// instruction will be scalarized when vectorizing with the associated
1658   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1659   DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1660 
1661   /// Holds the instructions known to be uniform after vectorization.
1662   /// The data is collected per VF.
1663   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1664 
1665   /// Holds the instructions known to be scalar after vectorization.
1666   /// The data is collected per VF.
1667   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1668 
1669   /// Holds the instructions (address computations) that are forced to be
1670   /// scalarized.
1671   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1672 
1673   /// PHINodes of the reductions that should be expanded in-loop.
1674   SmallPtrSet<PHINode *, 4> InLoopReductions;
1675 
1676   /// A Map of inloop reduction operations and their immediate chain operand.
1677   /// FIXME: This can be removed once reductions can be costed correctly in
1678   /// VPlan. This was added to allow quick lookup of the inloop operations.
1679   DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1680 
1681   /// Returns the expected difference in cost from scalarizing the expression
1682   /// feeding a predicated instruction \p PredInst. The instructions to
1683   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1684   /// non-negative return value implies the expression will be scalarized.
1685   /// Currently, only single-use chains are considered for scalarization.
1686   InstructionCost computePredInstDiscount(Instruction *PredInst,
1687                                           ScalarCostsTy &ScalarCosts,
1688                                           ElementCount VF);
1689 
1690   /// Collect the instructions that are uniform after vectorization. An
1691   /// instruction is uniform if we represent it with a single scalar value in
1692   /// the vectorized loop corresponding to each vector iteration. Examples of
1693   /// uniform instructions include pointer operands of consecutive or
1694   /// interleaved memory accesses. Note that although uniformity implies an
1695   /// instruction will be scalar, the reverse is not true. In general, a
1696   /// scalarized instruction will be represented by VF scalar values in the
1697   /// vectorized loop, each corresponding to an iteration of the original
1698   /// scalar loop.
1699   void collectLoopUniforms(ElementCount VF);
1700 
1701   /// Collect the instructions that are scalar after vectorization. An
1702   /// instruction is scalar if it is known to be uniform or will be scalarized
1703   /// during vectorization. collectLoopScalars should only add non-uniform nodes
1704   /// to the list if they are used by a load/store instruction that is marked as
1705   /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
1706   /// VF values in the vectorized loop, each corresponding to an iteration of
1707   /// the original scalar loop.
1708   void collectLoopScalars(ElementCount VF);
1709 
1710   /// Keeps cost model vectorization decision and cost for instructions.
1711   /// Right now it is used for memory instructions only.
1712   using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1713                                 std::pair<InstWidening, InstructionCost>>;
1714 
1715   DecisionList WideningDecisions;
1716 
1717   using CallDecisionList =
1718       DenseMap<std::pair<CallInst *, ElementCount>, CallWideningDecision>;
1719 
1720   CallDecisionList CallWideningDecisions;
1721 
1722   /// Returns true if \p V is expected to be vectorized and it needs to be
1723   /// extracted.
1724   bool needsExtract(Value *V, ElementCount VF) const {
1725     Instruction *I = dyn_cast<Instruction>(V);
1726     if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1727         TheLoop->isLoopInvariant(I) ||
1728         getWideningDecision(I, VF) == CM_Scalarize)
1729       return false;
1730 
1731     // Assume we can vectorize V (and hence we need extraction) if the
1732     // scalars are not computed yet. This can happen, because it is called
1733     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1734     // the scalars are collected. That should be a safe assumption in most
1735     // cases, because we check if the operands have vectorizable types
1736     // beforehand in LoopVectorizationLegality.
1737     return !Scalars.contains(VF) || !isScalarAfterVectorization(I, VF);
1738   };
1739 
1740   /// Returns a vector containing only the operands that need to be extracted.
1741   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1742                                                    ElementCount VF) const {
1743     return SmallVector<Value *, 4>(make_filter_range(
1744         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1745   }
1746 
1747 public:
1748   /// The loop that we evaluate.
1749   Loop *TheLoop;
1750 
1751   /// Predicated scalar evolution analysis.
1752   PredicatedScalarEvolution &PSE;
1753 
1754   /// Loop Info analysis.
1755   LoopInfo *LI;
1756 
1757   /// Vectorization legality.
1758   LoopVectorizationLegality *Legal;
1759 
1760   /// Vector target information.
1761   const TargetTransformInfo &TTI;
1762 
1763   /// Target Library Info.
1764   const TargetLibraryInfo *TLI;
1765 
1766   /// Demanded bits analysis.
1767   DemandedBits *DB;
1768 
1769   /// Assumption cache.
1770   AssumptionCache *AC;
1771 
1772   /// Interface to emit optimization remarks.
1773   OptimizationRemarkEmitter *ORE;
1774 
1775   const Function *TheFunction;
1776 
1777   /// Loop Vectorize Hint.
1778   const LoopVectorizeHints *Hints;
1779 
1780   /// The interleave access information contains groups of interleaved accesses
1781   /// with the same stride and close to each other.
1782   InterleavedAccessInfo &InterleaveInfo;
1783 
1784   /// Values to ignore in the cost model.
1785   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1786 
1787   /// Values to ignore in the cost model when VF > 1.
1788   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1789 
1790   /// All element types found in the loop.
1791   SmallPtrSet<Type *, 16> ElementTypesInLoop;
1792 
1793   /// The kind of cost that we are calculating
1794   TTI::TargetCostKind CostKind;
1795 };
1796 } // end namespace llvm
1797 
1798 namespace {
1799 /// Helper struct to manage generating runtime checks for vectorization.
1800 ///
1801 /// The runtime checks are created up-front in temporary blocks to allow better
1802 /// estimating the cost and un-linked from the existing IR. After deciding to
1803 /// vectorize, the checks are moved back. If deciding not to vectorize, the
1804 /// temporary blocks are completely removed.
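     /// An illustrative layout, once the checks are kept and re-linked (block
     /// names as used below):
     ///
     ///   preheader -> vector.scevcheck -> vector.memcheck -> vector preheader
     ///
     /// where each check block also branches to the scalar loop (the bypass
     /// block) if its condition fails.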
1805 class GeneratedRTChecks {
1806   /// Basic block which contains the generated SCEV checks, if any.
1807   BasicBlock *SCEVCheckBlock = nullptr;
1808 
1809   /// The value representing the result of the generated SCEV checks. If it is
1810   /// nullptr, either no SCEV checks have been generated or they have been used.
1811   Value *SCEVCheckCond = nullptr;
1812 
1813   /// Basic block which contains the generated memory runtime checks, if any.
1814   BasicBlock *MemCheckBlock = nullptr;
1815 
1816   /// The value representing the result of the generated memory runtime checks.
1817   /// If it is nullptr, either no memory runtime checks have been generated or
1818   /// they have been used.
1819   Value *MemRuntimeCheckCond = nullptr;
1820 
1821   DominatorTree *DT;
1822   LoopInfo *LI;
1823   TargetTransformInfo *TTI;
1824 
1825   SCEVExpander SCEVExp;
1826   SCEVExpander MemCheckExp;
1827 
1828   bool CostTooHigh = false;
1829   const bool AddBranchWeights;
1830 
1831   Loop *OuterLoop = nullptr;
1832 
1833   PredicatedScalarEvolution &PSE;
1834 
1835   /// The kind of cost that we are calculating
1836   TTI::TargetCostKind CostKind;
1837 
1838 public:
1839   GeneratedRTChecks(PredicatedScalarEvolution &PSE, DominatorTree *DT,
1840                     LoopInfo *LI, TargetTransformInfo *TTI,
1841                     const DataLayout &DL, bool AddBranchWeights,
1842                     TTI::TargetCostKind CostKind)
1843       : DT(DT), LI(LI), TTI(TTI), SCEVExp(*PSE.getSE(), DL, "scev.check"),
1844         MemCheckExp(*PSE.getSE(), DL, "scev.check"),
1845         AddBranchWeights(AddBranchWeights), PSE(PSE), CostKind(CostKind) {}
1846 
1847   /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1848   /// accurately estimate the cost of the runtime checks. The blocks are
1849   /// un-linked from the IR and are added back during vector code generation. If
1850   /// there is no vector code generation, the check blocks are removed
1851   /// completely.
1852   void create(Loop *L, const LoopAccessInfo &LAI,
1853               const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {
1854 
1855     // Hard cutoff to limit compile-time increase in case a very large number of
1856     // runtime checks needs to be generated.
1857     // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
1858     // profile info.
1859     CostTooHigh =
1860         LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold;
1861     if (CostTooHigh)
1862       return;
1863 
1864     BasicBlock *LoopHeader = L->getHeader();
1865     BasicBlock *Preheader = L->getLoopPreheader();
1866 
1867     // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1868     // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1869     // may be used by SCEVExpander. The blocks will be un-linked from their
1870     // predecessors and removed from LI & DT at the end of the function.
1871     if (!UnionPred.isAlwaysTrue()) {
1872       SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1873                                   nullptr, "vector.scevcheck");
1874 
1875       SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1876           &UnionPred, SCEVCheckBlock->getTerminator());
1877     }
1878 
1879     const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1880     if (RtPtrChecking.Need) {
1881       auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1882       MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
1883                                  "vector.memcheck");
1884 
1885       auto DiffChecks = RtPtrChecking.getDiffChecks();
1886       if (DiffChecks) {
1887         Value *RuntimeVF = nullptr;
1888         MemRuntimeCheckCond = addDiffRuntimeChecks(
1889             MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp,
1890             [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
1891               if (!RuntimeVF)
1892                 RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);
1893               return RuntimeVF;
1894             },
1895             IC);
1896       } else {
1897         MemRuntimeCheckCond = addRuntimeChecks(
1898             MemCheckBlock->getTerminator(), L, RtPtrChecking.getChecks(),
1899             MemCheckExp, VectorizerParams::HoistRuntimeChecks);
1900       }
1901       assert(MemRuntimeCheckCond &&
1902              "no RT checks generated although RtPtrChecking "
1903              "claimed checks are required");
1904     }
1905 
1906     if (!MemCheckBlock && !SCEVCheckBlock)
1907       return;
1908 
1909     // Unhook the temporary block with the checks, update various places
1910     // accordingly.
1911     if (SCEVCheckBlock)
1912       SCEVCheckBlock->replaceAllUsesWith(Preheader);
1913     if (MemCheckBlock)
1914       MemCheckBlock->replaceAllUsesWith(Preheader);
1915 
1916     if (SCEVCheckBlock) {
1917       SCEVCheckBlock->getTerminator()->moveBefore(
1918           Preheader->getTerminator()->getIterator());
1919       new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
1920       Preheader->getTerminator()->eraseFromParent();
1921     }
1922     if (MemCheckBlock) {
1923       MemCheckBlock->getTerminator()->moveBefore(
1924           Preheader->getTerminator()->getIterator());
1925       new UnreachableInst(Preheader->getContext(), MemCheckBlock);
1926       Preheader->getTerminator()->eraseFromParent();
1927     }
1928 
1929     DT->changeImmediateDominator(LoopHeader, Preheader);
1930     if (MemCheckBlock) {
1931       DT->eraseNode(MemCheckBlock);
1932       LI->removeBlock(MemCheckBlock);
1933     }
1934     if (SCEVCheckBlock) {
1935       DT->eraseNode(SCEVCheckBlock);
1936       LI->removeBlock(SCEVCheckBlock);
1937     }
1938 
1939     // Outer loop is used as part of the later cost calculations.
1940     OuterLoop = L->getParentLoop();
1941   }
1942 
1943   InstructionCost getCost() {
1944     if (SCEVCheckBlock || MemCheckBlock)
1945       LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");
1946 
1947     if (CostTooHigh) {
1948       InstructionCost Cost;
1949       Cost.setInvalid();
1950       LLVM_DEBUG(dbgs() << "  number of checks exceeded threshold\n");
1951       return Cost;
1952     }
1953 
1954     InstructionCost RTCheckCost = 0;
1955     if (SCEVCheckBlock)
1956       for (Instruction &I : *SCEVCheckBlock) {
1957         if (SCEVCheckBlock->getTerminator() == &I)
1958           continue;
1959         InstructionCost C = TTI->getInstructionCost(&I, CostKind);
1960         LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
1961         RTCheckCost += C;
1962       }
1963     if (MemCheckBlock) {
1964       InstructionCost MemCheckCost = 0;
1965       for (Instruction &I : *MemCheckBlock) {
1966         if (MemCheckBlock->getTerminator() == &I)
1967           continue;
1968         InstructionCost C = TTI->getInstructionCost(&I, CostKind);
1969         LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
1970         MemCheckCost += C;
1971       }
1972 
1973       // If the runtime memory checks are being created inside an outer loop
1974       // we should find out if these checks are outer loop invariant. If so,
1975       // the checks will likely be hoisted out and so the effective cost will
1976       // reduce according to the outer loop trip count.
1977       if (OuterLoop) {
1978         ScalarEvolution *SE = MemCheckExp.getSE();
1979         // TODO: If profitable, we could refine this further by analysing every
1980         // individual memory check, since there could be a mixture of loop
1981         // variant and invariant checks that mean the final condition is
1982         // variant.
1983         const SCEV *Cond = SE->getSCEV(MemRuntimeCheckCond);
1984         if (SE->isLoopInvariant(Cond, OuterLoop)) {
1985           // It seems reasonable to assume that we can reduce the effective
1986           // cost of the checks even when we know nothing about the trip
1987           // count. Assume that the outer loop executes at least twice.
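               // Illustrative example (hypothetical numbers): with a memory
               // check cost of 8 and a best trip count estimate of 4, the
               // effective cost charged to this loop becomes 8 / 4 = 2.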
1988           unsigned BestTripCount = 2;
1989 
1990           // Get the best known TC estimate.
1991           if (auto EstimatedTC = getSmallBestKnownTC(
1992                   PSE, OuterLoop, /* CanUseConstantMax = */ false))
1993             BestTripCount = *EstimatedTC;
1994 
1995           BestTripCount = std::max(BestTripCount, 1U);
1996           InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount;
1997 
1998           // Let's ensure the cost is always at least 1.
1999           NewMemCheckCost = std::max(*NewMemCheckCost.getValue(),
2000                                      (InstructionCost::CostType)1);
2001 
2002           if (BestTripCount > 1)
2003             LLVM_DEBUG(dbgs()
2004                        << "We expect runtime memory checks to be hoisted "
2005                        << "out of the outer loop. Cost reduced from "
2006                        << MemCheckCost << " to " << NewMemCheckCost << '\n');
2007 
2008           MemCheckCost = NewMemCheckCost;
2009         }
2010       }
2011 
2012       RTCheckCost += MemCheckCost;
2013     }
2014 
2015     if (SCEVCheckBlock || MemCheckBlock)
2016       LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
2017                         << "\n");
2018 
2019     return RTCheckCost;
2020   }
2021 
2022   /// Remove the created SCEV & memory runtime check blocks & instructions, if
2023   /// unused.
2024   ~GeneratedRTChecks() {
2025     SCEVExpanderCleaner SCEVCleaner(SCEVExp);
2026     SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
2027     if (!SCEVCheckCond)
2028       SCEVCleaner.markResultUsed();
2029 
2030     if (!MemRuntimeCheckCond)
2031       MemCheckCleaner.markResultUsed();
2032 
2033     if (MemRuntimeCheckCond) {
2034       auto &SE = *MemCheckExp.getSE();
2035       // Memory runtime check generation creates compares that use expanded
2036       // values. Remove them before running the SCEVExpanderCleaners.
2037       for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
2038         if (MemCheckExp.isInsertedInstruction(&I))
2039           continue;
2040         SE.forgetValue(&I);
2041         I.eraseFromParent();
2042       }
2043     }
2044     MemCheckCleaner.cleanup();
2045     SCEVCleaner.cleanup();
2046 
2047     if (SCEVCheckCond)
2048       SCEVCheckBlock->eraseFromParent();
2049     if (MemRuntimeCheckCond)
2050       MemCheckBlock->eraseFromParent();
2051   }
2052 
2053   /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
2054   /// adjusts the branches to branch to the vector preheader or \p Bypass,
2055   /// depending on the generated condition.
2056   BasicBlock *emitSCEVChecks(BasicBlock *Bypass,
2057                              BasicBlock *LoopVectorPreHeader) {
2058     if (!SCEVCheckCond)
2059       return nullptr;
2060 
2061     Value *Cond = SCEVCheckCond;
2062     // Mark the check as used, to prevent it from being removed during cleanup.
2063     SCEVCheckCond = nullptr;
2064     if (auto *C = dyn_cast<ConstantInt>(Cond))
2065       if (C->isZero())
2066         return nullptr;
2067 
2068     auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2069 
2070     BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
2071     // Create new preheader for vector loop.
2072     if (OuterLoop)
2073       OuterLoop->addBasicBlockToLoop(SCEVCheckBlock, *LI);
2074 
2075     SCEVCheckBlock->getTerminator()->eraseFromParent();
2076     SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2077     Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2078                                                 SCEVCheckBlock);
2079 
2080     DT->addNewBlock(SCEVCheckBlock, Pred);
2081     DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
2082 
2083     BranchInst &BI = *BranchInst::Create(Bypass, LoopVectorPreHeader, Cond);
2084     if (AddBranchWeights)
2085       setBranchWeights(BI, SCEVCheckBypassWeights, /*IsExpected=*/false);
2086     ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), &BI);
2087     return SCEVCheckBlock;
2088   }
2089 
2090   /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2091   /// the branches to branch to the vector preheader or \p Bypass, depending on
2092   /// the generated condition.
2093   BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass,
2094                                    BasicBlock *LoopVectorPreHeader) {
2095     // Check if we generated code that checks in runtime if arrays overlap.
2096     if (!MemRuntimeCheckCond)
2097       return nullptr;
2098 
2099     auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2100     Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2101                                                 MemCheckBlock);
2102 
2103     DT->addNewBlock(MemCheckBlock, Pred);
2104     DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
2105     MemCheckBlock->moveBefore(LoopVectorPreHeader);
2106 
2107     if (OuterLoop)
2108       OuterLoop->addBasicBlockToLoop(MemCheckBlock, *LI);
2109 
2110     BranchInst &BI =
2111         *BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond);
2112     if (AddBranchWeights) {
2113       setBranchWeights(BI, MemCheckBypassWeights, /*IsExpected=*/false);
2114     }
2115     ReplaceInstWithInst(MemCheckBlock->getTerminator(), &BI);
2116     MemCheckBlock->getTerminator()->setDebugLoc(
2117         Pred->getTerminator()->getDebugLoc());
2118 
2119     // Mark the check as used, to prevent it from being removed during cleanup.
2120     MemRuntimeCheckCond = nullptr;
2121     return MemCheckBlock;
2122   }
2123 };
2124 } // namespace
2125 
2126 static bool useActiveLaneMask(TailFoldingStyle Style) {
2127   return Style == TailFoldingStyle::Data ||
2128          Style == TailFoldingStyle::DataAndControlFlow ||
2129          Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2130 }
2131 
2132 static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style) {
2133   return Style == TailFoldingStyle::DataAndControlFlow ||
2134          Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2135 }
2136 
2137 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
2138 // vectorization. The loop needs to be annotated with #pragma omp simd
2139 // simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If the
2140 // vector length information is not provided, vectorization is not considered
2141 // explicit. Interleave hints are not allowed either. These limitations will be
2142 // relaxed in the future.
2143 // Please note that we are currently forced to abuse the pragma 'clang
2144 // vectorize' semantics. This pragma provides *auto-vectorization hints*
2145 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2146 // provides *explicit vectorization hints* (LV can bypass legal checks and
2147 // assume that vectorization is legal). However, both hints are implemented
2148 // using the same metadata (llvm.loop.vectorize, processed by
2149 // LoopVectorizeHints). This will be fixed in the future when the native IR
2150 // representation for pragma 'omp simd' is introduced.
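     // For example (illustrative), an outer loop annotated with
     //   #pragma omp simd simdlen(4)
     // or
     //   #pragma clang loop vectorize(enable) vectorize_width(4)
     // is treated as explicitly vectorized with a requested width of 4.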
2151 static bool isExplicitVecOuterLoop(Loop *OuterLp,
2152                                    OptimizationRemarkEmitter *ORE) {
2153   assert(!OuterLp->isInnermost() && "This is not an outer loop");
2154   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2155 
2156   // Only outer loops with an explicit vectorization hint are supported.
2157   // Unannotated outer loops are ignored.
2158   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
2159     return false;
2160 
2161   Function *Fn = OuterLp->getHeader()->getParent();
2162   if (!Hints.allowVectorization(Fn, OuterLp,
2163                                 true /*VectorizeOnlyWhenForced*/)) {
2164     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2165     return false;
2166   }
2167 
2168   if (Hints.getInterleave() > 1) {
2169     // TODO: Interleave support is future work.
2170     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2171                          "outer loops.\n");
2172     Hints.emitRemarkWithHints();
2173     return false;
2174   }
2175 
2176   return true;
2177 }
2178 
2179 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
2180                                   OptimizationRemarkEmitter *ORE,
2181                                   SmallVectorImpl<Loop *> &V) {
2182   // Collect inner loops and outer loops without irreducible control flow. For
2183   // now, only collect outer loops that have explicit vectorization hints. If we
2184   // are stress testing the VPlan H-CFG construction, we collect the outermost
2185   // loop of every loop nest.
2186   if (L.isInnermost() || VPlanBuildStressTest ||
2187       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
2188     LoopBlocksRPO RPOT(&L);
2189     RPOT.perform(LI);
2190     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2191       V.push_back(&L);
2192       // TODO: Collect inner loops inside marked outer loops in case
2193       // vectorization fails for the outer loop. Do not invoke
2194       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2195       // already known to be reducible. We can use an inherited attribute for
2196       // that.
2197       return;
2198     }
2199   }
2200   for (Loop *InnerL : L)
2201     collectSupportedLoops(*InnerL, LI, ORE, V);
2202 }
2203 
2204 //===----------------------------------------------------------------------===//
2205 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer,
2206 // LoopVectorizationCostModel and LoopVectorizationPlanner.
2207 //===----------------------------------------------------------------------===//
2208 
2209 /// Compute the transformed value of Index at offset StartValue using step
2210 /// StepValue.
2211 /// For integer induction, returns StartValue + Index * StepValue.
2212 /// For pointer induction, returns StartValue[Index * StepValue].
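     /// For example (illustrative values), an integer induction with StartValue
     /// 10 and StepValue 3 maps Index 4 to 10 + 4 * 3 = 22.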
2213 /// FIXME: The newly created binary instructions should contain nsw/nuw
2214 /// flags, which can be found from the original scalar operations.
2215 static Value *
2216 emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue,
2217                      Value *Step,
2218                      InductionDescriptor::InductionKind InductionKind,
2219                      const BinaryOperator *InductionBinOp) {
2220   Type *StepTy = Step->getType();
2221   Value *CastedIndex = StepTy->isIntegerTy()
2222                            ? B.CreateSExtOrTrunc(Index, StepTy)
2223                            : B.CreateCast(Instruction::SIToFP, Index, StepTy);
2224   if (CastedIndex != Index) {
2225     CastedIndex->setName(CastedIndex->getName() + ".cast");
2226     Index = CastedIndex;
2227   }
2228 
2229   // Note: the IR at this point is broken. We cannot use SE to create any new
2230   // SCEV and then expand it, hoping that SCEV's simplification will give us
2231   // more optimal code. Unfortunately, attempting to do so on invalid IR may
2232   // lead to various SCEV crashes. So all we can do is use the builder and rely
2233   // on InstCombine for future simplifications. Here we handle some trivial
2234   // cases only.
2235   auto CreateAdd = [&B](Value *X, Value *Y) {
2236     assert(X->getType() == Y->getType() && "Types don't match!");
2237     if (auto *CX = dyn_cast<ConstantInt>(X))
2238       if (CX->isZero())
2239         return Y;
2240     if (auto *CY = dyn_cast<ConstantInt>(Y))
2241       if (CY->isZero())
2242         return X;
2243     return B.CreateAdd(X, Y);
2244   };
2245 
2246   // We allow X to be a vector type, in which case Y will potentially be
2247   // splatted into a vector with the same element count.
2248   auto CreateMul = [&B](Value *X, Value *Y) {
2249     assert(X->getType()->getScalarType() == Y->getType() &&
2250            "Types don't match!");
2251     if (auto *CX = dyn_cast<ConstantInt>(X))
2252       if (CX->isOne())
2253         return Y;
2254     if (auto *CY = dyn_cast<ConstantInt>(Y))
2255       if (CY->isOne())
2256         return X;
2257     VectorType *XVTy = dyn_cast<VectorType>(X->getType());
2258     if (XVTy && !isa<VectorType>(Y->getType()))
2259       Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
2260     return B.CreateMul(X, Y);
2261   };
2262 
2263   switch (InductionKind) {
2264   case InductionDescriptor::IK_IntInduction: {
2265     assert(!isa<VectorType>(Index->getType()) &&
2266            "Vector indices not supported for integer inductions yet");
2267     assert(Index->getType() == StartValue->getType() &&
2268            "Index type does not match StartValue type");
2269     if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne())
2270       return B.CreateSub(StartValue, Index);
2271     auto *Offset = CreateMul(Index, Step);
2272     return CreateAdd(StartValue, Offset);
2273   }
2274   case InductionDescriptor::IK_PtrInduction:
2275     return B.CreatePtrAdd(StartValue, CreateMul(Index, Step));
2276   case InductionDescriptor::IK_FpInduction: {
2277     assert(!isa<VectorType>(Index->getType()) &&
2278            "Vector indices not supported for FP inductions yet");
2279     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2280     assert(InductionBinOp &&
2281            (InductionBinOp->getOpcode() == Instruction::FAdd ||
2282             InductionBinOp->getOpcode() == Instruction::FSub) &&
2283            "Original bin op should be defined for FP induction");
2284 
2285     Value *MulExp = B.CreateFMul(Step, Index);
2286     return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2287                          "induction");
2288   }
2289   case InductionDescriptor::IK_NoInduction:
2290     return nullptr;
2291   }
2292   llvm_unreachable("invalid enum");
2293 }
2294 
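     /// Return the maximum value of vscale to assume for \p F, preferring the
     /// target's answer and otherwise falling back to the function's
     /// vscale_range attribute, e.g. (illustrative) vscale_range(1,16) yields
     /// 16. Returns std::nullopt if neither source provides a bound.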
2295 std::optional<unsigned> getMaxVScale(const Function &F,
2296                                      const TargetTransformInfo &TTI) {
2297   if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale())
2298     return MaxVScale;
2299 
2300   if (F.hasFnAttribute(Attribute::VScaleRange))
2301     return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
2302 
2303   return std::nullopt;
2304 }
2305 
2306 /// Given the VF, UF and the maximum trip count computed for the loop, return
2307 /// whether the induction variable might overflow in the vectorized loop. If not,
2308 /// then we know a runtime overflow check always evaluates to false and can be
2309 /// removed.
2310 static bool isIndvarOverflowCheckKnownFalse(
2311     const LoopVectorizationCostModel *Cost,
2312     ElementCount VF, std::optional<unsigned> UF = std::nullopt) {
2313   // Always be conservative if we don't know the exact unroll factor.
2314   unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF);
2315 
2316   Type *IdxTy = Cost->Legal->getWidestInductionType();
2317   APInt MaxUIntTripCount = cast<IntegerType>(IdxTy)->getMask();
2318 
2319   // The runtime overflow check is known to be false iff the (max) trip-count
2320   // is known and (max) trip-count + (VF * UF) does not overflow in the type of
2321   // the vector loop induction variable.
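       // Illustrative example (hypothetical values): for an i8 induction type
       // the mask is 255; with a known max trip count of 200, VF = 4 and
       // UF = 2, 255 - 200 = 55 > 4 * 2 = 8, so the check is known to be false.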
2322   if (unsigned TC = Cost->PSE.getSmallConstantMaxTripCount()) {
2323     uint64_t MaxVF = VF.getKnownMinValue();
2324     if (VF.isScalable()) {
2325       std::optional<unsigned> MaxVScale =
2326           getMaxVScale(*Cost->TheFunction, Cost->TTI);
2327       if (!MaxVScale)
2328         return false;
2329       MaxVF *= *MaxVScale;
2330     }
2331 
2332     return (MaxUIntTripCount - TC).ugt(MaxVF * MaxUF);
2333   }
2334 
2335   return false;
2336 }
2337 
2338 // Return whether we allow using masked interleave-groups (for dealing with
2339 // strided loads/stores that reside in predicated blocks, or for dealing
2340 // with gaps).
2341 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2342   // If an override option has been passed in for interleaved accesses, use it.
2343   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2344     return EnableMaskedInterleavedMemAccesses;
2345 
2346   return TTI.enableMaskedInterleavedAccessVectorization();
2347 }
2348 
2349 void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr,
2350                                                VPReplicateRecipe *RepRecipe,
2351                                                const VPLane &Lane,
2352                                                VPTransformState &State) {
2353   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2354 
2355   // Does this instruction return a value?
2356   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2357 
2358   Instruction *Cloned = Instr->clone();
2359   if (!IsVoidRetTy) {
2360     Cloned->setName(Instr->getName() + ".cloned");
2361 #if !defined(NDEBUG)
2362     // Verify that VPlan type inference results agree with the type of the
2363     // generated values.
2364     assert(State.TypeAnalysis.inferScalarType(RepRecipe) == Cloned->getType() &&
2365            "inferred type and type from generated instructions do not match");
2366 #endif
2367   }
2368 
2369   RepRecipe->setFlags(Cloned);
2370 
2371   if (auto DL = Instr->getDebugLoc())
2372     State.setDebugLocFrom(DL);
2373 
2374   // Replace the operands of the cloned instructions with their scalar
2375   // equivalents in the new loop.
2376   for (const auto &I : enumerate(RepRecipe->operands())) {
2377     auto InputLane = Lane;
2378     VPValue *Operand = I.value();
2379     if (vputils::isUniformAfterVectorization(Operand))
2380       InputLane = VPLane::getFirstLane();
2381     Cloned->setOperand(I.index(), State.get(Operand, InputLane));
2382   }
2383   State.addNewMetadata(Cloned, Instr);
2384 
2385   // Place the cloned scalar in the new loop.
2386   State.Builder.Insert(Cloned);
2387 
2388   State.set(RepRecipe, Cloned, Lane);
2389 
2390   // If we just cloned a new assumption, add it the assumption cache.
2391   if (auto *II = dyn_cast<AssumeInst>(Cloned))
2392     AC->registerAssumption(II);
2393 
2394   // End if-block.
2395   VPRegionBlock *Parent = RepRecipe->getParent()->getParent();
2396   bool IfPredicateInstr = Parent ? Parent->isReplicator() : false;
2397   assert(
2398       (Parent || !RepRecipe->getParent()->getPlan()->getVectorLoopRegion() ||
2399        all_of(RepRecipe->operands(),
2400               [](VPValue *Op) { return Op->isDefinedOutsideLoopRegions(); })) &&
2401       "Expected a recipe is either within a region or all of its operands "
2402       "are defined outside the vectorized region.");
2403   if (IfPredicateInstr)
2404     PredicatedInstructions.push_back(Cloned);
2405 }
2406 
2407 Value *
2408 InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
2409   if (VectorTripCount)
2410     return VectorTripCount;
2411 
2412   Value *TC = getTripCount();
2413   IRBuilder<> Builder(InsertBlock->getTerminator());
2414 
2415   Type *Ty = TC->getType();
2416   // This is where we can make the step a runtime constant.
2417   Value *Step = createStepForVF(Builder, Ty, VF, UF);
2418 
2419   // If the tail is to be folded by masking, round the number of iterations N
2420   // up to a multiple of Step instead of rounding down. This is done by first
2421   // adding Step-1 and then rounding down. Note that it's ok if this addition
2422   // overflows: the vector induction variable will eventually wrap to zero given
2423   // that it starts at zero and its Step is a power of two; the loop will then
2424   // exit, with the last early-exit vector comparison also producing all-true.
2425   // For scalable vectors the VF is not guaranteed to be a power of 2, but this
2426   // is accounted for in emitIterationCountCheck that adds an overflow check.
2427   if (Cost->foldTailByMasking()) {
2428     assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
2429            "VF*UF must be a power of 2 when folding tail by masking");
2430     TC = Builder.CreateAdd(TC, Builder.CreateSub(Step, ConstantInt::get(Ty, 1)),
2431                            "n.rnd.up");
2432   }
2433 
2434   // Now we need to generate the expression for the part of the loop that the
2435   // vectorized body will execute. This is equal to N - (N % Step) if scalar
2436   // iterations are not required for correctness, or N - Step, otherwise. Step
2437   // is equal to the vectorization factor (number of SIMD elements) times the
2438   // unroll factor (number of SIMD instructions).
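       // For example, with N = 20, VF = 4 and UF = 2 (and no scalar epilogue
       // forced), Step is 8, so n.mod.vf = 20 % 8 = 4 and n.vec = 20 - 4 = 16:
       // the vector loop runs two wide iterations and the scalar loop handles
       // the remaining four iterations.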
2439   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2440 
2441   // There are cases where we *must* run at least one iteration in the remainder
2442   // loop.  See the cost model for when this can happen.  If the step evenly
2443   // divides the trip count, we set the remainder to be equal to the step. If
2444   // the step does not evenly divide the trip count, no adjustment is necessary
2445   // since there will already be scalar iterations. Note that the minimum
2446   // iterations check ensures that N >= Step.
2447   if (Cost->requiresScalarEpilogue(VF.isVector())) {
2448     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2449     R = Builder.CreateSelect(IsZero, Step, R);
2450   }
2451 
2452   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2453 
2454   return VectorTripCount;
2455 }
2456 
2457 void InnerLoopVectorizer::introduceCheckBlockInVPlan(BasicBlock *CheckIRBB) {
2458   VPBlockBase *ScalarPH = Plan.getScalarPreheader();
2459   VPBlockBase *PreVectorPH = VectorPHVPB->getSinglePredecessor();
2460   if (PreVectorPH->getNumSuccessors() != 1) {
2461     assert(PreVectorPH->getNumSuccessors() == 2 && "Expected 2 successors");
2462     assert(PreVectorPH->getSuccessors()[0] == ScalarPH &&
2463            "Unexpected successor");
2464     VPIRBasicBlock *CheckVPIRBB = Plan.createVPIRBasicBlock(CheckIRBB);
2465     VPBlockUtils::insertOnEdge(PreVectorPH, VectorPHVPB, CheckVPIRBB);
2466     PreVectorPH = CheckVPIRBB;
2467   }
2468   VPBlockUtils::connectBlocks(PreVectorPH, ScalarPH);
2469   PreVectorPH->swapSuccessors();
2470 }
2471 
2472 void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
2473   Value *Count = getTripCount();
2474   // Reuse existing vector loop preheader for TC checks.
2475   // Note that new preheader block is generated for vector loop.
2476   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2477   IRBuilder<> Builder(TCCheckBlock->getTerminator());
2478 
2479   // Generate code to check if the loop's trip count is less than VF * UF, or
2480   // equal to it in case a scalar epilogue is required; this implies that the
2481   // vector trip count is zero. This check also covers the case where adding one
2482   // to the backedge-taken count overflowed leading to an incorrect trip count
2483   // of zero. In this case we will also jump to the scalar loop.
2484   auto P = Cost->requiresScalarEpilogue(VF.isVector()) ? ICmpInst::ICMP_ULE
2485                                                        : ICmpInst::ICMP_ULT;
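       // E.g. when a scalar epilogue is required and the trip count equals
       // VF * UF, the vector trip count would be zero, so the ULE predicate
       // chosen above (rather than ULT) also takes the bypass in that case.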
2486 
2487   // If tail is to be folded, vector loop takes care of all iterations.
2488   Type *CountTy = Count->getType();
2489   Value *CheckMinIters = Builder.getFalse();
2490   auto CreateStep = [&]() -> Value * {
2491     // Create step with max(MinProfitableTripCount, UF * VF).
2492     if (UF * VF.getKnownMinValue() >= MinProfitableTripCount.getKnownMinValue())
2493       return createStepForVF(Builder, CountTy, VF, UF);
2494 
2495     Value *MinProfTC =
2496         createStepForVF(Builder, CountTy, MinProfitableTripCount, 1);
2497     if (!VF.isScalable())
2498       return MinProfTC;
2499     return Builder.CreateBinaryIntrinsic(
2500         Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF));
2501   };
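       // For instance, assuming a fixed VF of 4, UF of 2 and a fixed
       // MinProfitableTripCount of 16, CreateStep() above returns the constant
       // 16; with a scalable VF of vscale x 4 it returns umax(16, 8 * vscale).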
2502 
2503   TailFoldingStyle Style = Cost->getTailFoldingStyle();
2504   if (Style == TailFoldingStyle::None) {
2505     Value *Step = CreateStep();
2506     ScalarEvolution &SE = *PSE.getSE();
2507     // TODO: Emit unconditional branch to vector preheader instead of
2508     // conditional branch with known condition.
2509     const SCEV *TripCountSCEV = SE.applyLoopGuards(SE.getSCEV(Count), OrigLoop);
2510     // Check if the trip count is < the step.
2511     if (SE.isKnownPredicate(P, TripCountSCEV, SE.getSCEV(Step))) {
2512       // TODO: Ensure step is at most the trip count when determining max VF and
2513       // UF, w/o tail folding.
2514       CheckMinIters = Builder.getTrue();
2515     } else if (!SE.isKnownPredicate(CmpInst::getInversePredicate(P),
2516                                     TripCountSCEV, SE.getSCEV(Step))) {
2517       // Generate the minimum iteration check only if we cannot prove the
2518       // check is known to be true, or known to be false.
2519       CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
2520     } // else step known to be < trip count, use CheckMinIters preset to false.
2521   } else if (VF.isScalable() &&
2522              !isIndvarOverflowCheckKnownFalse(Cost, VF, UF) &&
2523              Style != TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) {
2524     // vscale is not necessarily a power-of-2, which means we cannot guarantee
2525     // an overflow to zero when updating induction variables and so an
2526     // additional overflow check is required before entering the vector loop.
2527 
2528     // Get the maximum unsigned value for the type.
2529     Value *MaxUIntTripCount =
2530         ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask());
2531     Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count);
2532 
2533     // Don't execute the vector loop if (UMax - n) < (VF * UF).
2534     CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep());
2535   }
2536 
2537   // Create new preheader for vector loop.
2538   LoopVectorPreHeader =
2539       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
2540                  "vector.ph");
2541 
2542   assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
2543                                DT->getNode(Bypass)->getIDom()) &&
2544          "TC check is expected to dominate Bypass");
2545 
2546   BranchInst &BI =
2547       *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
2548   if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
2549     setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
2550   ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
2551   LoopBypassBlocks.push_back(TCCheckBlock);
2552 
2553   // TODO: Wrap LoopVectorPreHeader in VPIRBasicBlock here.
2554   introduceCheckBlockInVPlan(TCCheckBlock);
2555 }
2556 
2557 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {
2558   BasicBlock *const SCEVCheckBlock =
2559       RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader);
2560   if (!SCEVCheckBlock)
2561     return nullptr;
2562 
2563   assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
2564            (OptForSizeBasedOnProfile &&
2565             Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
2566          "Cannot SCEV check stride or overflow when optimizing for size");
2567   assert(!LoopBypassBlocks.empty() &&
2568          "Should already be a bypass block due to iteration count check");
2569   LoopBypassBlocks.push_back(SCEVCheckBlock);
2570   AddedSafetyChecks = true;
2571 
2572   introduceCheckBlockInVPlan(SCEVCheckBlock);
2573   return SCEVCheckBlock;
2574 }
2575 
2576 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) {
2577   // VPlan-native path does not do any analysis for runtime checks currently.
2578   if (EnableVPlanNativePath)
2579     return nullptr;
2580 
2581   BasicBlock *const MemCheckBlock =
2582       RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader);
2583 
2584   // Check if we generated code that checks at runtime whether arrays overlap. We put
2585   // the checks into a separate block to make the more common case of few
2586   // elements faster.
2587   if (!MemCheckBlock)
2588     return nullptr;
2589 
2590   if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
2591     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
2592            "Cannot emit memory checks when optimizing for size, unless forced "
2593            "to vectorize.");
2594     ORE->emit([&]() {
2595       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
2596                                         OrigLoop->getStartLoc(),
2597                                         OrigLoop->getHeader())
2598              << "Code-size may be reduced by not forcing "
2599                 "vectorization, or by source-code modifications "
2600                 "eliminating the need for runtime checks "
2601                 "(e.g., adding 'restrict').";
2602     });
2603   }
2604 
2605   LoopBypassBlocks.push_back(MemCheckBlock);
2606 
2607   AddedSafetyChecks = true;
2608 
2609   introduceCheckBlockInVPlan(MemCheckBlock);
2610   return MemCheckBlock;
2611 }
2612 
2613 /// Replace \p VPBB with a VPIRBasicBlock wrapping \p IRBB. All recipes from \p
2614 /// VPBB are moved to the end of the newly created VPIRBasicBlock. VPBB must
2615 /// have a single predecessor, which is rewired to the new VPIRBasicBlock. All
2616 /// successors of VPBB, if any, are rewired to the new VPIRBasicBlock.
2617 static void replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB) {
2618   VPIRBasicBlock *IRVPBB = VPBB->getPlan()->createVPIRBasicBlock(IRBB);
2619   for (auto &R : make_early_inc_range(*VPBB)) {
2620     assert(!R.isPhi() && "Tried to move phi recipe to end of block");
2621     R.moveBefore(*IRVPBB, IRVPBB->end());
2622   }
2623 
2624   VPBlockUtils::reassociateBlocks(VPBB, IRVPBB);
2625   // VPBB is now dead and will be cleaned up when the plan gets destroyed.
2626 }
2627 
2628 void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
2629   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
2630   assert(LoopVectorPreHeader && "Invalid loop structure");
2631   assert((OrigLoop->getUniqueLatchExitBlock() ||
2632           Cost->requiresScalarEpilogue(VF.isVector())) &&
2633          "loops not exiting via the latch without required epilogue?");
2634 
2635   LoopMiddleBlock =
2636       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
2637                  LI, nullptr, Twine(Prefix) + "middle.block");
2638   replaceVPBBWithIRVPBB(Plan.getMiddleBlock(), LoopMiddleBlock);
2639   LoopScalarPreHeader =
2640       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
2641                  nullptr, Twine(Prefix) + "scalar.ph");
2642   replaceVPBBWithIRVPBB(Plan.getScalarPreheader(), LoopScalarPreHeader);
2643 }
2644 
2645 /// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
2646 /// expansion results.
2647 static Value *getExpandedStep(const InductionDescriptor &ID,
2648                               const SCEV2ValueTy &ExpandedSCEVs) {
2649   const SCEV *Step = ID.getStep();
2650   if (auto *C = dyn_cast<SCEVConstant>(Step))
2651     return C->getValue();
2652   if (auto *U = dyn_cast<SCEVUnknown>(Step))
2653     return U->getValue();
2654   auto I = ExpandedSCEVs.find(Step);
2655   assert(I != ExpandedSCEVs.end() && "SCEV must be expanded at this point");
2656   return I->second;
2657 }
2658 
2659 /// Knowing that loop \p L executes a single vector iteration, add instructions
2660 /// that will get simplified and thus should not have any cost to \p
2661 /// InstsToIgnore.
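     /// For example, for `for (i = 0; i != 8; ++i)` vectorized with
     /// VF * UF == 8, the latch compare and the `++i` update (when its only
     /// users are the IV phi and that compare) fold away after unrolling and
     /// should not be costed.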
2662 static void addFullyUnrolledInstructionsToIgnore(
2663     Loop *L, const LoopVectorizationLegality::InductionList &IL,
2664     SmallPtrSetImpl<Instruction *> &InstsToIgnore) {
2665   auto *Cmp = L->getLatchCmpInst();
2666   if (Cmp)
2667     InstsToIgnore.insert(Cmp);
2668   for (const auto &KV : IL) {
2669     // Extract the key by hand so that it can be used in the lambda below.  Note
2670     // that captured structured bindings are a C++20 extension.
2671     const PHINode *IV = KV.first;
2672 
2673     // Get next iteration value of the induction variable.
2674     Instruction *IVInst =
2675         cast<Instruction>(IV->getIncomingValueForBlock(L->getLoopLatch()));
2676     if (all_of(IVInst->users(),
2677                [&](const User *U) { return U == IV || U == Cmp; }))
2678       InstsToIgnore.insert(IVInst);
2679   }
2680 }
2681 
2682 void InnerLoopVectorizer::createInductionAdditionalBypassValues(
2683     const SCEV2ValueTy &ExpandedSCEVs, Value *MainVectorTripCount) {
2684   assert(MainVectorTripCount && "Must have bypass information");
2685 
2686   Instruction *OldInduction = Legal->getPrimaryInduction();
2687   IRBuilder<> BypassBuilder(getAdditionalBypassBlock(),
2688                             getAdditionalBypassBlock()->getFirstInsertionPt());
2689   for (const auto &InductionEntry : Legal->getInductionVars()) {
2690     PHINode *OrigPhi = InductionEntry.first;
2691     const InductionDescriptor &II = InductionEntry.second;
2692     Value *Step = getExpandedStep(II, ExpandedSCEVs);
2693     // For the primary induction the additional bypass end value is known.
2694     // Otherwise it is computed.
2695     Value *EndValueFromAdditionalBypass = MainVectorTripCount;
2696     if (OrigPhi != OldInduction) {
2697       auto *BinOp = II.getInductionBinOp();
2698       // Fast-math-flags propagate from the original induction instruction.
2699       if (isa_and_nonnull<FPMathOperator>(BinOp))
2700         BypassBuilder.setFastMathFlags(BinOp->getFastMathFlags());
2701 
2702       // Compute the end value for the additional bypass.
2703       EndValueFromAdditionalBypass =
2704           emitTransformedIndex(BypassBuilder, MainVectorTripCount,
2705                                II.getStartValue(), Step, II.getKind(), BinOp);
2706       EndValueFromAdditionalBypass->setName("ind.end");
2707     }
2708 
2709     // Store the bypass value here, as it needs to be added as an operand to its
2710     // scalar preheader phi node after the epilogue skeleton has been created.
2711     // TODO: Directly add as extra operand to the VPResumePHI recipe.
2712     assert(!Induction2AdditionalBypassValue.contains(OrigPhi) &&
2713            "entry for OrigPhi already exists");
2714     Induction2AdditionalBypassValue[OrigPhi] = EndValueFromAdditionalBypass;
2715   }
2716 }
2717 
2718 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton(
2719     const SCEV2ValueTy &ExpandedSCEVs) {
2720   /*
2721    In this function we generate a new loop. The new loop will contain
2722    the vectorized instructions while the old loop will continue to run the
2723    scalar remainder.
2724 
2725        [ ] <-- old preheader - loop iteration number check and SCEVs in Plan's
2726      /  |      preheader are expanded here. Eventually all required SCEV
2727     /   |      expansion should happen here.
2728    /    v
2729   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
2730   |  /  |
2731   | /   v
2732   ||   [ ]     <-- vector pre header.
2733   |/    |
2734   |     v
2735   |    [  ] \
2736   |    [  ]_|   <-- vector loop (created during VPlan execution).
2737   |     |
2738   |     v
2739   \   -[ ]   <--- middle-block (wrapped in VPIRBasicBlock with the branch to
2740    |    |                       successors created during VPlan execution)
2741    \/   |
2742    /\   v
2743    | ->[ ]     <--- new preheader (wrapped in VPIRBasicBlock).
2744    |    |
2745  (opt)  v      <-- edge from middle to exit iff epilogue is not required.
2746    |   [ ] \
2747    |   [ ]_|   <-- old scalar loop to handle remainder (scalar epilogue, header
2748    |    |          wrapped in VPIRBasicBlock).
2749     \   |
2750      \  v
2751       >[ ]     <-- exit block(s). (wrapped in VPIRBasicBlock)
2752    ...
2753    */
2754 
2755   // Create an empty vector loop, and prepare basic blocks for the runtime
2756   // checks.
2757   createVectorLoopSkeleton("");
2758 
2759   // Now, compare the new count to zero. If it is zero, skip the vector loop and
2760   // jump to the scalar loop. This check also covers the case where the
2761   // backedge-taken count is uint##_max: adding one to it will overflow leading
2762   // to an incorrect trip count of zero. In this (rare) case we will also jump
2763   // to the scalar loop.
2764   emitIterationCountCheck(LoopScalarPreHeader);
2765 
2766   // Generate the code to check any assumptions that we've made for SCEV
2767   // expressions.
2768   emitSCEVChecks(LoopScalarPreHeader);
2769 
2770   // Generate the code that checks at runtime whether arrays overlap. We put the
2771   // checks into a separate block to make the more common case of few elements
2772   // faster.
2773   emitMemRuntimeChecks(LoopScalarPreHeader);
2774 
2775   return LoopVectorPreHeader;
2776 }
2777 
2778 namespace {
2779 
2780 struct CSEDenseMapInfo {
2781   static bool canHandle(const Instruction *I) {
2782     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
2783            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
2784   }
2785 
2786   static inline Instruction *getEmptyKey() {
2787     return DenseMapInfo<Instruction *>::getEmptyKey();
2788   }
2789 
2790   static inline Instruction *getTombstoneKey() {
2791     return DenseMapInfo<Instruction *>::getTombstoneKey();
2792   }
2793 
2794   static unsigned getHashValue(const Instruction *I) {
2795     assert(canHandle(I) && "Unknown instruction!");
2796     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
2797                                                            I->value_op_end()));
2798   }
2799 
2800   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
2801     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
2802         LHS == getTombstoneKey() || RHS == getTombstoneKey())
2803       return LHS == RHS;
2804     return LHS->isIdenticalTo(RHS);
2805   }
2806 };
2807 
2808 } // end anonymous namespace
2809 
2810 /// Perform CSE of induction variable instructions.
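     /// For example, identical extractelement or getelementptr instructions
     /// emitted for different unrolled parts hash to the same key, so later
     /// duplicates are replaced by the first occurrence and erased.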
2811 static void cse(BasicBlock *BB) {
2812   // Perform simple cse.
2813   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
2814   for (Instruction &In : llvm::make_early_inc_range(*BB)) {
2815     if (!CSEDenseMapInfo::canHandle(&In))
2816       continue;
2817 
2818     // Check if we can replace this instruction with any of the
2819     // visited instructions.
2820     if (Instruction *V = CSEMap.lookup(&In)) {
2821       In.replaceAllUsesWith(V);
2822       In.eraseFromParent();
2823       continue;
2824     }
2825 
2826     CSEMap[&In] = &In;
2827   }
2828 }
2829 
2830 InstructionCost
2831 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
2832                                               ElementCount VF) const {
2833   // We only need to calculate a cost if the VF is scalar; for actual vectors
2834   // we should already have a pre-calculated cost at each VF.
2835   if (!VF.isScalar())
2836     return CallWideningDecisions.at(std::make_pair(CI, VF)).Cost;
2837 
2838   Type *RetTy = CI->getType();
2839   if (RecurrenceDescriptor::isFMulAddIntrinsic(CI))
2840     if (auto RedCost = getReductionPatternCost(CI, VF, RetTy))
2841       return *RedCost;
2842 
2843   SmallVector<Type *, 4> Tys;
2844   for (auto &ArgOp : CI->args())
2845     Tys.push_back(ArgOp->getType());
2846 
2847   InstructionCost ScalarCallCost =
2848       TTI.getCallInstrCost(CI->getCalledFunction(), RetTy, Tys, CostKind);
2849 
2850   // If this is an intrinsic we may have a lower cost for it.
2851   if (getVectorIntrinsicIDForCall(CI, TLI)) {
2852     InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
2853     return std::min(ScalarCallCost, IntrinsicCost);
2854   }
2855   return ScalarCallCost;
2856 }
2857 
2858 static Type *maybeVectorizeType(Type *Elt, ElementCount VF) {
2859   if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
2860     return Elt;
2861   return VectorType::get(Elt, VF);
2862 }
2863 
2864 InstructionCost
2865 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
2866                                                    ElementCount VF) const {
2867   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
2868   assert(ID && "Expected intrinsic call!");
2869   Type *RetTy = maybeVectorizeType(CI->getType(), VF);
2870   FastMathFlags FMF;
2871   if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
2872     FMF = FPMO->getFastMathFlags();
2873 
2874   SmallVector<const Value *> Arguments(CI->args());
2875   FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
2876   SmallVector<Type *> ParamTys;
2877   std::transform(FTy->param_begin(), FTy->param_end(),
2878                  std::back_inserter(ParamTys),
2879                  [&](Type *Ty) { return maybeVectorizeType(Ty, VF); });
2880 
2881   IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
2882                                     dyn_cast<IntrinsicInst>(CI));
2883   return TTI.getIntrinsicInstrCost(CostAttrs, CostKind);
2884 }
2885 
2886 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
2887   // Fix widened non-induction PHIs by setting up the PHI operands.
2888   if (EnableVPlanNativePath)
2889     fixNonInductionPHIs(State);
2890 
2891   // Forget the original basic block.
2892   PSE.getSE()->forgetLoop(OrigLoop);
2893   PSE.getSE()->forgetBlockAndLoopDispositions();
2894 
2895   // After vectorization, the exit blocks of the original loop will have
2896   // additional predecessors. Invalidate SCEVs for the exit phis in case SE
2897   // looked through single-entry phis.
2898   SmallVector<BasicBlock *> ExitBlocks;
2899   OrigLoop->getExitBlocks(ExitBlocks);
2900   for (BasicBlock *Exit : ExitBlocks)
2901     for (PHINode &PN : Exit->phis())
2902       PSE.getSE()->forgetLcssaPhiWithNewPredecessor(OrigLoop, &PN);
2903 
2904   // Don't apply optimizations below when no vector region remains, as they all
2905   // require a vector loop at the moment.
2906   if (!State.Plan->getVectorLoopRegion())
2907     return;
2908 
2909   for (Instruction *PI : PredicatedInstructions)
2910     sinkScalarOperands(&*PI);
2911 
2912   VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion();
2913   VPBasicBlock *HeaderVPBB = VectorRegion->getEntryBasicBlock();
2914   BasicBlock *HeaderBB = State.CFG.VPBB2IRBB[HeaderVPBB];
2915 
2916   // Remove redundant induction instructions.
2917   cse(HeaderBB);
2918 
2919   // Set/update profile weights for the vector and remainder loops as original
2920   // loop iterations are now distributed among them. Note that original loop
2921   // becomes the scalar remainder loop after vectorization.
2922   //
2923   // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
2924   // end up getting a slightly roughened result, but that should be OK since
2925   // profile is not inherently precise anyway. Note also possible bypass of
2926   // vector code caused by legality checks is ignored, assigning all the weight
2927   // to the vector loop, optimistically.
2928   //
2929   // For scalable vectorization we can't know at compile time how many
2930   // iterations of the loop are handled in one vector iteration, so instead
2931   // assume a pessimistic vscale of '1'.
2932   Loop *VectorLoop = LI->getLoopFor(HeaderBB);
2933   setProfileInfoAfterUnrolling(OrigLoop, VectorLoop, OrigLoop,
2934                                VF.getKnownMinValue() * UF);
2935 }
2936 
2937 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
2938   // The basic block and loop containing the predicated instruction.
2939   auto *PredBB = PredInst->getParent();
2940   auto *VectorLoop = LI->getLoopFor(PredBB);
2941 
2942   // Initialize a worklist with the operands of the predicated instruction.
2943   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
2944 
2945   // Holds instructions that we need to analyze again. An instruction may be
2946   // reanalyzed if we don't yet know if we can sink it or not.
2947   SmallVector<Instruction *, 8> InstsToReanalyze;
2948 
2949   // Returns true if a given use occurs in the predicated block. Phi nodes use
2950   // their operands in their corresponding predecessor blocks.
2951   auto IsBlockOfUsePredicated = [&](Use &U) -> bool {
2952     auto *I = cast<Instruction>(U.getUser());
2953     BasicBlock *BB = I->getParent();
2954     if (auto *Phi = dyn_cast<PHINode>(I))
2955       BB = Phi->getIncomingBlock(
2956           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
2957     return BB == PredBB;
2958   };
2959 
2960   // Iteratively sink the scalarized operands of the predicated instruction
2961   // into the block we created for it. When an instruction is sunk, its
2962   // operands are then added to the worklist. The algorithm ends once a full
2963   // pass over the worklist fails to sink any instruction.
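       // For example, if a getelementptr feeding only a predicated store was
       // left outside the predicated block, it is moved in here, and its own
       // scalar operands are then reconsidered for sinking.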
2964   bool Changed;
2965   do {
2966     // Add the instructions that need to be reanalyzed to the worklist, and
2967     // reset the changed indicator.
2968     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
2969     InstsToReanalyze.clear();
2970     Changed = false;
2971 
2972     while (!Worklist.empty()) {
2973       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
2974 
2975       // We can't sink an instruction if it is a phi node, is not in the loop,
2976       // may have side effects or may read from memory.
2977       // TODO: Could do more granular checking to allow sinking
2978       // a load past non-store instructions.
2979       if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
2980           I->mayHaveSideEffects() || I->mayReadFromMemory())
2981         continue;
2982 
2983       // If the instruction is already in PredBB, check if we can sink its
2984       // operands. In that case, VPlan's sinkScalarOperands() succeeded in
2985       // sinking the scalar instruction I, hence it appears in PredBB; but it
2986       // may have failed to sink I's operands (recursively), which we try
2987       // (again) here.
2988       if (I->getParent() == PredBB) {
2989         Worklist.insert(I->op_begin(), I->op_end());
2990         continue;
2991       }
2992 
2993       // It's legal to sink the instruction if all its uses occur in the
2994       // predicated block. Otherwise, there's nothing to do yet, and we may
2995       // need to reanalyze the instruction.
2996       if (!llvm::all_of(I->uses(), IsBlockOfUsePredicated)) {
2997         InstsToReanalyze.push_back(I);
2998         continue;
2999       }
3000 
3001       // Move the instruction to the beginning of the predicated block, and add
3002       // its operands to the worklist.
3003       I->moveBefore(PredBB->getFirstInsertionPt());
3004       Worklist.insert(I->op_begin(), I->op_end());
3005 
3006       // The sinking may have enabled other instructions to be sunk, so we will
3007       // need to iterate.
3008       Changed = true;
3009     }
3010   } while (Changed);
3011 }
3012 
3013 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) {
3014   auto Iter = vp_depth_first_deep(Plan.getEntry());
3015   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
3016     for (VPRecipeBase &P : VPBB->phis()) {
3017       VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P);
3018       if (!VPPhi)
3019         continue;
3020       PHINode *NewPhi = cast<PHINode>(State.get(VPPhi));
3021       // Make sure the builder has a valid insert point.
3022       Builder.SetInsertPoint(NewPhi);
3023       for (unsigned Idx = 0; Idx < VPPhi->getNumOperands(); ++Idx) {
3024         VPValue *Inc = VPPhi->getIncomingValue(Idx);
3025         VPBasicBlock *VPBB = VPPhi->getIncomingBlock(Idx);
3026         NewPhi->addIncoming(State.get(Inc), State.CFG.VPBB2IRBB[VPBB]);
3027       }
3028     }
3029   }
3030 }
3031 
3032 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
3033   // We should not collect Scalars more than once per VF. Right now, this
3034   // function is called from collectUniformsAndScalars(), which already does
3035   // this check. Collecting Scalars for VF=1 does not make any sense.
3036   assert(VF.isVector() && !Scalars.contains(VF) &&
3037          "This function should not be visited twice for the same VF");
3038 
3039   // This avoids any chances of creating a REPLICATE recipe during planning
3040   // since that would result in generation of scalarized code during execution,
3041   // which is not supported for scalable vectors.
3042   if (VF.isScalable()) {
3043     Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end());
3044     return;
3045   }
3046 
3047   SmallSetVector<Instruction *, 8> Worklist;
3048 
3049   // These sets are used to seed the analysis with pointers used by memory
3050   // accesses that will remain scalar.
3051   SmallSetVector<Instruction *, 8> ScalarPtrs;
3052   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
3053   auto *Latch = TheLoop->getLoopLatch();
3054 
3055   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
3056   // The pointer operands of loads and stores will be scalar as long as the
3057   // memory access is not a gather or scatter operation. The value operand of a
3058   // store will remain scalar if the store is scalarized.
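       // For instance, a consecutive (CM_Widen) load keeps a scalar pointer
       // operand, whereas a CM_GatherScatter access needs a vector of
       // pointers, so its pointer is not a scalar use.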
3059   auto IsScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
3060     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
3061     assert(WideningDecision != CM_Unknown &&
3062            "Widening decision should be ready at this moment");
3063     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
3064       if (Ptr == Store->getValueOperand())
3065         return WideningDecision == CM_Scalarize;
3066     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
3067            "Ptr is neither a value nor a pointer operand");
3068     return WideningDecision != CM_GatherScatter;
3069   };
3070 
3071   // A helper that returns true if the given value is a getelementptr
3072   // instruction contained in the loop.
3073   auto IsLoopVaryingGEP = [&](Value *V) {
3074     return isa<GetElementPtrInst>(V) && !TheLoop->isLoopInvariant(V);
3075   };
3076 
3077   // A helper that evaluates a memory access's use of a pointer. If the use will
3078   // be a scalar use and the pointer is only used by memory accesses, we place
3079   // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
3080   // PossibleNonScalarPtrs.
3081   auto EvaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
3082     // We only care about bitcast and getelementptr instructions contained in
3083     // the loop.
3084     if (!IsLoopVaryingGEP(Ptr))
3085       return;
3086 
3087     // If the pointer has already been identified as scalar (e.g., if it was
3088     // also identified as uniform), there's nothing to do.
3089     auto *I = cast<Instruction>(Ptr);
3090     if (Worklist.count(I))
3091       return;
3092 
3093     // If the use of the pointer will be a scalar use, and all users of the
3094     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
3095     // place the pointer in PossibleNonScalarPtrs.
3096     if (IsScalarUse(MemAccess, Ptr) &&
3097         all_of(I->users(), IsaPred<LoadInst, StoreInst>))
3098       ScalarPtrs.insert(I);
3099     else
3100       PossibleNonScalarPtrs.insert(I);
3101   };
3102 
3103   // We seed the scalars analysis with two classes of instructions: (1)
3104   // instructions marked uniform-after-vectorization and (2) bitcast,
3105   // getelementptr and (pointer) phi instructions used by memory accesses
3106   // requiring a scalar use.
3107   //
3108   // (1) Add to the worklist all instructions that have been identified as
3109   // uniform-after-vectorization.
3110   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
3111 
3112   // (2) Add to the worklist all bitcast and getelementptr instructions used by
3113   // memory accesses requiring a scalar use. The pointer operands of loads and
3114   // stores will be scalar unless the operation is a gather or scatter.
3115   // The value operand of a store will remain scalar if the store is scalarized.
3116   for (auto *BB : TheLoop->blocks())
3117     for (auto &I : *BB) {
3118       if (auto *Load = dyn_cast<LoadInst>(&I)) {
3119         EvaluatePtrUse(Load, Load->getPointerOperand());
3120       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
3121         EvaluatePtrUse(Store, Store->getPointerOperand());
3122         EvaluatePtrUse(Store, Store->getValueOperand());
3123       }
3124     }
3125   for (auto *I : ScalarPtrs)
3126     if (!PossibleNonScalarPtrs.count(I)) {
3127       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
3128       Worklist.insert(I);
3129     }
3130 
3131   // Insert the forced scalars.
3132   // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector
3133   // induction variable when the PHI user is scalarized.
3134   auto ForcedScalar = ForcedScalars.find(VF);
3135   if (ForcedScalar != ForcedScalars.end())
3136     for (auto *I : ForcedScalar->second) {
3137       LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n");
3138       Worklist.insert(I);
3139     }
3140 
3141   // Expand the worklist by looking through any bitcasts and getelementptr
3142   // instructions we've already identified as scalar. This is similar to the
3143   // expansion step in collectLoopUniforms(); however, here we're only
3144   // expanding to include additional bitcasts and getelementptr instructions.
3145   unsigned Idx = 0;
3146   while (Idx != Worklist.size()) {
3147     Instruction *Dst = Worklist[Idx++];
3148     if (!IsLoopVaryingGEP(Dst->getOperand(0)))
3149       continue;
3150     auto *Src = cast<Instruction>(Dst->getOperand(0));
3151     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
3152           auto *J = cast<Instruction>(U);
3153           return !TheLoop->contains(J) || Worklist.count(J) ||
3154                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
3155                   IsScalarUse(J, Src));
3156         })) {
3157       Worklist.insert(Src);
3158       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
3159     }
3160   }
3161 
3162   // An induction variable will remain scalar if all users of the induction
3163   // variable and induction variable update remain scalar.
3164   for (const auto &Induction : Legal->getInductionVars()) {
3165     auto *Ind = Induction.first;
3166     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
3167 
3168     // If tail-folding is applied, the primary induction variable will be used
3169     // to feed a vector compare.
3170     if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
3171       continue;
3172 
3173     // Returns true if \p Indvar is a pointer induction that is used directly by
3174     // load/store instruction \p I.
3175     auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
3176                                               Instruction *I) {
3177       return Induction.second.getKind() ==
3178                  InductionDescriptor::IK_PtrInduction &&
3179              (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
3180              Indvar == getLoadStorePointerOperand(I) && IsScalarUse(I, Indvar);
3181     };
3182 
3183     // Determine if all users of the induction variable are scalar after
3184     // vectorization.
3185     bool ScalarInd = all_of(Ind->users(), [&](User *U) -> bool {
3186       auto *I = cast<Instruction>(U);
3187       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
3188              IsDirectLoadStoreFromPtrIndvar(Ind, I);
3189     });
3190     if (!ScalarInd)
3191       continue;
3192 
3193     // If the induction variable update is a fixed-order recurrence, neither the
3194     // induction variable nor its update should be marked scalar after
3195     // vectorization.
3196     auto *IndUpdatePhi = dyn_cast<PHINode>(IndUpdate);
3197     if (IndUpdatePhi && Legal->isFixedOrderRecurrence(IndUpdatePhi))
3198       continue;
3199 
3200     // Determine if all users of the induction variable update instruction are
3201     // scalar after vectorization.
3202     bool ScalarIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool {
3203       auto *I = cast<Instruction>(U);
3204       return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
3205              IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
3206     });
3207     if (!ScalarIndUpdate)
3208       continue;
3209 
3210     // The induction variable and its update instruction will remain scalar.
3211     Worklist.insert(Ind);
3212     Worklist.insert(IndUpdate);
3213     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
3214     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
3215                       << "\n");
3216   }
3217 
3218   Scalars[VF].insert(Worklist.begin(), Worklist.end());
3219 }
3220 
3221 bool LoopVectorizationCostModel::isScalarWithPredication(
3222     Instruction *I, ElementCount VF) const {
3223   if (!isPredicatedInst(I))
3224     return false;
3225 
3226   // Do we have a non-scalar lowering for this predicated
3227   // instruction? No - it is scalar with predication.
3228   switch (I->getOpcode()) {
3229   default:
3230     return true;
3231   case Instruction::Call:
3232     if (VF.isScalar())
3233       return true;
3234     return CallWideningDecisions.at(std::make_pair(cast<CallInst>(I), VF))
3235                .Kind == CM_Scalarize;
3236   case Instruction::Load:
3237   case Instruction::Store: {
3238     auto *Ptr = getLoadStorePointerOperand(I);
3239     auto *Ty = getLoadStoreType(I);
3240     Type *VTy = Ty;
3241     if (VF.isVector())
3242       VTy = VectorType::get(Ty, VF);
3243     const Align Alignment = getLoadStoreAlignment(I);
3244     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
3245                                 TTI.isLegalMaskedGather(VTy, Alignment))
3246                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
3247                                 TTI.isLegalMaskedScatter(VTy, Alignment));
3248   }
3249   case Instruction::UDiv:
3250   case Instruction::SDiv:
3251   case Instruction::SRem:
3252   case Instruction::URem: {
3253     // We have the option to use the safe-divisor idiom to avoid predication.
3254     // The cost based decision here will always select safe-divisor for
3255     // scalable vectors as scalarization isn't legal.
3256     const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
3257     return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost);
3258   }
3259   }
3260 }
3261 
3262 // TODO: Fold into LoopVectorizationLegality::isMaskRequired.
3263 bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
3264   // If predication is not needed, avoid it.
3265   // TODO: We can use the loop-preheader as context point here and get
3266   // context sensitive reasoning for isSafeToSpeculativelyExecute.
3267   if (!blockNeedsPredicationForAnyReason(I->getParent()) ||
3268       isSafeToSpeculativelyExecute(I) ||
3269       (isa<LoadInst, StoreInst, CallInst>(I) && !Legal->isMaskRequired(I)) ||
3270       isa<BranchInst, SwitchInst, PHINode, AllocaInst>(I))
3271     return false;
3272 
3273   // If the instruction was executed conditionally in the original scalar loop,
3274   // predication is needed with a mask whose lanes are all possibly inactive.
3275   if (Legal->blockNeedsPredication(I->getParent()))
3276     return true;
3277 
3278   // All that remain are instructions with side-effects originally executed in
3279   // the loop unconditionally, but now execute under a tail-fold mask (only)
3280   // having at least one active lane (the first). If the side-effects of the
3281   // instruction are invariant, executing it w/o (the tail-folding) mask is safe
3282   // - it will cause the same side-effects as when masked.
3283   switch (I->getOpcode()) {
3284   default:
3285     llvm_unreachable(
3286         "instruction should have been considered by earlier checks");
3287   case Instruction::Call:
3288     // Side-effects of a Call are assumed to be non-invariant, needing a
3289     // (fold-tail) mask.
3290     assert(Legal->isMaskRequired(I) &&
3291            "should have returned earlier for calls not needing a mask");
3292     return true;
3293   case Instruction::Load:
3294     // If the address is loop invariant no predication is needed.
3295     return !Legal->isInvariant(getLoadStorePointerOperand(I));
3296   case Instruction::Store: {
3297     // For stores, we need to prove speculation safety (which follows from
3298     // the same argument as loads) and also that the value being stored is
3299     // correct. The easiest form of the latter is to require that all values
3300     // stored are the same.
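         // E.g. an unconditional `*p = 42` with a loop-invariant p needs no
         // mask under tail folding, whereas a store of a value that varies per
         // iteration still does.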
3301     return !(Legal->isInvariant(getLoadStorePointerOperand(I)) &&
3302              TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()));
3303   }
3304   case Instruction::UDiv:
3305   case Instruction::SDiv:
3306   case Instruction::SRem:
3307   case Instruction::URem:
3308     // If the divisor is loop-invariant no predication is needed.
3309     return !TheLoop->isLoopInvariant(I->getOperand(1));
3310   }
3311 }
3312 
3313 std::pair<InstructionCost, InstructionCost>
3314 LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
3315                                                     ElementCount VF) const {
3316   assert(I->getOpcode() == Instruction::UDiv ||
3317          I->getOpcode() == Instruction::SDiv ||
3318          I->getOpcode() == Instruction::SRem ||
3319          I->getOpcode() == Instruction::URem);
3320   assert(!isSafeToSpeculativelyExecute(I));
3321 
3322   // Scalarization isn't legal for scalable vector types
3323   InstructionCost ScalarizationCost = InstructionCost::getInvalid();
3324   if (!VF.isScalable()) {
3325     // Get the scalarization cost and scale this amount by the probability of
3326     // executing the predicated block. If the instruction is not predicated,
3327     // we fall through to the next case.
3328     ScalarizationCost = 0;
3329 
3330     // These instructions have a non-void type, so account for the phi nodes
3331     // that we will create. This cost is likely to be zero. The phi node
3332     // cost, if any, should be scaled by the block probability because it
3333     // models a copy at the end of each predicated block.
3334     ScalarizationCost += VF.getKnownMinValue() *
3335       TTI.getCFInstrCost(Instruction::PHI, CostKind);
3336 
3337     // The cost of the non-predicated instruction.
3338     ScalarizationCost += VF.getKnownMinValue() *
3339       TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind);
3340 
3341     // The cost of insertelement and extractelement instructions needed for
3342     // scalarization.
3343     ScalarizationCost += getScalarizationOverhead(I, VF);
3344 
3345     // Scale the cost by the probability of executing the predicated blocks.
3346     // This assumes the predicated block for each vector lane is equally
3347     // likely.
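         // For example, with VF = 4 this sums four predicated scalar divides,
         // the phi copies and the insert/extract overhead, then divides by
         // getReciprocalPredBlockProb() to reflect that each lane's predicated
         // block is not always executed.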
3348     ScalarizationCost = ScalarizationCost / getReciprocalPredBlockProb();
3349   }
3350   InstructionCost SafeDivisorCost = 0;
3351 
3352   auto *VecTy = toVectorTy(I->getType(), VF);
3353 
3354   // The cost of the select guard to ensure all lanes are well defined
3355   // after we speculate above any internal control flow.
3356   SafeDivisorCost +=
3357       TTI.getCmpSelInstrCost(Instruction::Select, VecTy,
3358                              toVectorTy(Type::getInt1Ty(I->getContext()), VF),
3359                              CmpInst::BAD_ICMP_PREDICATE, CostKind);
3360 
3361   // Certain instructions can be cheaper to vectorize if they have a constant
3362   // second vector operand. One example of this are shifts on x86.
3363   Value *Op2 = I->getOperand(1);
3364   auto Op2Info = TTI.getOperandInfo(Op2);
3365   if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
3366       Legal->isInvariant(Op2))
3367     Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
3368 
3369   SmallVector<const Value *, 4> Operands(I->operand_values());
3370   SafeDivisorCost += TTI.getArithmeticInstrCost(
3371     I->getOpcode(), VecTy, CostKind,
3372     {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
3373     Op2Info, Operands, I);
3374   return {ScalarizationCost, SafeDivisorCost};
3375 }
3376 
3377 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
3378     Instruction *I, ElementCount VF) const {
3379   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
3380   assert(getWideningDecision(I, VF) == CM_Unknown &&
3381          "Decision should not be set yet.");
3382   auto *Group = getInterleavedAccessGroup(I);
3383   assert(Group && "Must have a group.");
3384   unsigned InterleaveFactor = Group->getFactor();
3385 
3386   // If the instruction's allocated size doesn't equal its type size, it
3387   // requires padding and will be scalarized.
3388   auto &DL = I->getDataLayout();
3389   auto *ScalarTy = getLoadStoreType(I);
3390   if (hasIrregularType(ScalarTy, DL))
3391     return false;
3392 
3393   // We currently only know how to emit interleave/deinterleave with
3394   // Factor=2 for scalable vectors. This is purely an implementation
3395   // limit.
3396   if (VF.isScalable() && InterleaveFactor != 2)
3397     return false;
3398 
3399   // If the group involves a non-integral pointer, we may not be able to
3400   // losslessly cast all values to a common type.
3401   bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy);
3402   for (unsigned Idx = 0; Idx < InterleaveFactor; Idx++) {
3403     Instruction *Member = Group->getMember(Idx);
3404     if (!Member)
3405       continue;
3406     auto *MemberTy = getLoadStoreType(Member);
3407     bool MemberNI = DL.isNonIntegralPointerType(MemberTy);
3408     // Don't coerce non-integral pointers to integers or vice versa.
3409     if (MemberNI != ScalarNI)
3410       // TODO: Consider adding special nullptr value case here
3411       return false;
3412     if (MemberNI && ScalarNI &&
3413         ScalarTy->getPointerAddressSpace() !=
3414             MemberTy->getPointerAddressSpace())
3415       return false;
3416   }
3417 
3418   // Check if masking is required.
3419   // A Group may need masking for one of two reasons: it resides in a block that
3420   // needs predication, or it was decided to use masking to deal with gaps
3421   // (either a gap at the end of a load-access that may result in a speculative
3422   // load, or any gaps in a store-access).
3423   bool PredicatedAccessRequiresMasking =
3424       blockNeedsPredicationForAnyReason(I->getParent()) &&
3425       Legal->isMaskRequired(I);
3426   bool LoadAccessWithGapsRequiresEpilogMasking =
3427       isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
3428       !isScalarEpilogueAllowed();
3429   bool StoreAccessWithGapsRequiresMasking =
3430       isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
3431   if (!PredicatedAccessRequiresMasking &&
3432       !LoadAccessWithGapsRequiresEpilogMasking &&
3433       !StoreAccessWithGapsRequiresMasking)
3434     return true;
3435 
3436   // If masked interleaving is required, we expect that the user/target had
3437   // enabled it, because otherwise it either wouldn't have been created or
3438   // it should have been invalidated by the CostModel.
3439   assert(useMaskedInterleavedAccesses(TTI) &&
3440          "Masked interleave-groups for predicated accesses are not enabled.");
3441 
3442   if (Group->isReverse())
3443     return false;
3444 
3445   auto *Ty = getLoadStoreType(I);
3446   const Align Alignment = getLoadStoreAlignment(I);
3447   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
3448                           : TTI.isLegalMaskedStore(Ty, Alignment);
3449 }
3450 
3451 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
3452     Instruction *I, ElementCount VF) {
3453   // Get and ensure we have a valid memory instruction.
3454   assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
3455 
3456   auto *Ptr = getLoadStorePointerOperand(I);
3457   auto *ScalarTy = getLoadStoreType(I);
3458 
3459   // In order to be widened, the pointer should be consecutive, first of all.
3460   if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
3461     return false;
3462 
3463   // If the instruction is a store located in a predicated block, it will be
3464   // scalarized.
3465   if (isScalarWithPredication(I, VF))
3466     return false;
3467 
3468   // If the instruction's allocated size doesn't equal its type size, it
3469   // requires padding and will be scalarized.
3470   auto &DL = I->getDataLayout();
3471   if (hasIrregularType(ScalarTy, DL))
3472     return false;
3473 
3474   return true;
3475 }
3476 
3477 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
3478   // We should not collect Uniforms more than once per VF. Right now,
3479   // this function is called from collectUniformsAndScalars(), which
3480   // already does this check. Collecting Uniforms for VF=1 does not make any
3481   // sense.
3482 
3483   assert(VF.isVector() && !Uniforms.contains(VF) &&
3484          "This function should not be visited twice for the same VF");
3485 
3486   // Initialize an entry for this VF so that, even if no uniform values are
3487   // found, we won't analyze again: Uniforms.count(VF) will return 1.
3488   Uniforms[VF].clear();
3489 
3490   // Now we know that the loop is vectorizable!
3491   // Collect instructions inside the loop that will remain uniform after
3492   // vectorization.
3493 
3494   // Global values, params and instructions outside of current loop are out of
3495   // scope.
3496   auto IsOutOfScope = [&](Value *V) -> bool {
3497     Instruction *I = dyn_cast<Instruction>(V);
3498     return (!I || !TheLoop->contains(I));
3499   };
3500 
3501   // Worklist containing uniform instructions demanding lane 0.
3502   SetVector<Instruction *> Worklist;
3503 
3504   // Add uniform instructions demanding lane 0 to the worklist. Instructions
3505   // that require predication must not be considered uniform after
3506   // vectorization, because that would create an erroneous replicating region
3507   // where only a single instance out of VF should be formed.
3508   auto AddToWorklistIfAllowed = [&](Instruction *I) -> void {
3509     if (IsOutOfScope(I)) {
3510       LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
3511                         << *I << "\n");
3512       return;
3513     }
3514     if (isPredicatedInst(I)) {
3515       LLVM_DEBUG(
3516           dbgs() << "LV: Found not uniform due to requiring predication: " << *I
3517                  << "\n");
3518       return;
3519     }
3520     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
3521     Worklist.insert(I);
3522   };
3523 
3524   // Start with the conditional branches exiting the loop. If the branch
3525   // condition is an instruction contained in the loop that is only used by the
3526   // branch, it is uniform. Note conditions from uncountable early exits are not
3527   // uniform.
3528   SmallVector<BasicBlock *> Exiting;
3529   TheLoop->getExitingBlocks(Exiting);
3530   for (BasicBlock *E : Exiting) {
3531     if (Legal->hasUncountableEarlyExit() && TheLoop->getLoopLatch() != E)
3532       continue;
3533     auto *Cmp = dyn_cast<Instruction>(E->getTerminator()->getOperand(0));
3534     if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
3535       AddToWorklistIfAllowed(Cmp);
3536   }
3537 
3538   auto PrevVF = VF.divideCoefficientBy(2);
3539   // Return true if all lanes perform the same memory operation, and we can
3540   // thus choose to execute only one.
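       // For instance, `sum += *p` with a loop-invariant p is a uniform load:
       // a single scalar load per vector iteration produces the value for all
       // lanes. A store is only uniform if it also stores the same value on
       // every iteration.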
3541   auto IsUniformMemOpUse = [&](Instruction *I) {
3542     // If the value was already known to not be uniform for the previous
3543     // (smaller VF), it cannot be uniform for the larger VF.
3544     if (PrevVF.isVector()) {
3545       auto Iter = Uniforms.find(PrevVF);
3546       if (Iter != Uniforms.end() && !Iter->second.contains(I))
3547         return false;
3548     }
3549     if (!Legal->isUniformMemOp(*I, VF))
3550       return false;
3551     if (isa<LoadInst>(I))
3552       // Loading the same address always produces the same result - at least
3553       // assuming aliasing and ordering which have already been checked.
3554       return true;
3555     // Storing the same value on every iteration.
3556     return TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand());
3557   };
3558 
3559   auto IsUniformDecision = [&](Instruction *I, ElementCount VF) {
3560     InstWidening WideningDecision = getWideningDecision(I, VF);
3561     assert(WideningDecision != CM_Unknown &&
3562            "Widening decision should be ready at this moment");
3563 
3564     if (IsUniformMemOpUse(I))
3565       return true;
3566 
3567     return (WideningDecision == CM_Widen ||
3568             WideningDecision == CM_Widen_Reverse ||
3569             WideningDecision == CM_Interleave);
3570   };
3571 
3572   // Returns true if Ptr is the pointer operand of a memory access instruction
3573   // I, I is known to not require scalarization, and the pointer is not also
3574   // stored.
3575   auto IsVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
3576     if (isa<StoreInst>(I) && I->getOperand(0) == Ptr)
3577       return false;
3578     return getLoadStorePointerOperand(I) == Ptr &&
3579            (IsUniformDecision(I, VF) || Legal->isInvariant(Ptr));
3580   };
3581 
3582   // Holds a list of values which are known to have at least one uniform use.
3583   // Note that there may be other uses which aren't uniform.  A "uniform use"
3584   // here is something which only demands lane 0 of the unrolled iterations;
3585   // it does not imply that all lanes produce the same value (i.e. this is not
3586   // the usual meaning of uniform).
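       // For instance, the pointer operand of a consecutive widened load has a
       // uniform use: only the lane-0 address needs to be computed to form the
       // wide load, even though the lanes access different elements.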
3587   SetVector<Value *> HasUniformUse;
3588 
3589   // Scan the loop for instructions which are either a) known to have only
3590   // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
3591   for (auto *BB : TheLoop->blocks())
3592     for (auto &I : *BB) {
3593       if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
3594         switch (II->getIntrinsicID()) {
3595         case Intrinsic::sideeffect:
3596         case Intrinsic::experimental_noalias_scope_decl:
3597         case Intrinsic::assume:
3598         case Intrinsic::lifetime_start:
3599         case Intrinsic::lifetime_end:
3600           if (TheLoop->hasLoopInvariantOperands(&I))
3601             AddToWorklistIfAllowed(&I);
3602           break;
3603         default:
3604           break;
3605         }
3606       }
3607 
3608       // ExtractValue instructions must be uniform, because the operands are
3609       // known to be loop-invariant.
3610       if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
3611         assert(IsOutOfScope(EVI->getAggregateOperand()) &&
3612                "Expected aggregate value to be loop invariant");
3613         AddToWorklistIfAllowed(EVI);
3614         continue;
3615       }
3616 
3617       // If there's no pointer operand, there's nothing to do.
3618       auto *Ptr = getLoadStorePointerOperand(&I);
3619       if (!Ptr)
3620         continue;
3621 
3622       if (IsUniformMemOpUse(&I))
3623         AddToWorklistIfAllowed(&I);
3624 
3625       if (IsVectorizedMemAccessUse(&I, Ptr))
3626         HasUniformUse.insert(Ptr);
3627     }
3628 
3629   // Add to the worklist any operands which have *only* uniform (i.e. lane 0
3630   // demanding) users.  Since loops are assumed to be in LCSSA form, this
3631   // disallows uses outside the loop as well.
3632   for (auto *V : HasUniformUse) {
3633     if (IsOutOfScope(V))
3634       continue;
3635     auto *I = cast<Instruction>(V);
3636     bool UsersAreMemAccesses = all_of(I->users(), [&](User *U) -> bool {
3637       auto *UI = cast<Instruction>(U);
3638       return TheLoop->contains(UI) && IsVectorizedMemAccessUse(UI, V);
3639     });
3640     if (UsersAreMemAccesses)
3641       AddToWorklistIfAllowed(I);
3642   }
3643 
3644   // Expand Worklist in topological order: whenever a new instruction
3645   // is added, its users should already be inside the Worklist. This ensures
3646   // a uniform instruction will only be used by uniform instructions.
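       // For example (illustrative): if %a = add i64 %b, %c is in the Worklist
       // and %a is the only user of %b, then %b is added as well, provided %b
       // is defined inside the loop and is not a fixed-order recurrence phi.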
3647   unsigned Idx = 0;
3648   while (Idx != Worklist.size()) {
3649     Instruction *I = Worklist[Idx++];
3650 
3651     for (auto *OV : I->operand_values()) {
3652       // Out-of-scope operands cannot be uniform instructions.
3653       if (IsOutOfScope(OV))
3654         continue;
3655       // Fixed-order recurrence phis should typically be considered
3656       // non-uniform.
3657       auto *OP = dyn_cast<PHINode>(OV);
3658       if (OP && Legal->isFixedOrderRecurrence(OP))
3659         continue;
3660       // If all the users of the operand are uniform, then add the
3661       // operand into the uniform worklist.
3662       auto *OI = cast<Instruction>(OV);
3663       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
3664             auto *J = cast<Instruction>(U);
3665             return Worklist.count(J) || IsVectorizedMemAccessUse(J, OI);
3666           }))
3667         AddToWorklistIfAllowed(OI);
3668     }
3669   }
3670 
3671   // For an instruction to be added into Worklist above, all its users inside
3672   // the loop should also be in Worklist. However, this condition cannot be
3673   // true for phi nodes that form a cyclic dependence. We must process phi
3674   // nodes separately. An induction variable will remain uniform if all users
3675   // of the induction variable and induction variable update remain uniform.
3676   // The code below handles both pointer and non-pointer induction variables.
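       // For example (illustrative IR): given
       //   %i = phi i64 [ 0, %ph ], [ %i.next, %latch ]
       //   %i.next = add i64 %i, 1
       // %i and %i.next remain uniform only if every other in-loop user of
       // either is already in Worklist or is a vectorized memory access using
       // it as its pointer operand.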
3677   BasicBlock *Latch = TheLoop->getLoopLatch();
3678   for (const auto &Induction : Legal->getInductionVars()) {
3679     auto *Ind = Induction.first;
3680     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
3681 
3682     // Determine if all users of the induction variable are uniform after
3683     // vectorization.
3684     bool UniformInd = all_of(Ind->users(), [&](User *U) -> bool {
3685       auto *I = cast<Instruction>(U);
3686       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
3687              IsVectorizedMemAccessUse(I, Ind);
3688     });
3689     if (!UniformInd)
3690       continue;
3691 
3692     // Determine if all users of the induction variable update instruction are
3693     // uniform after vectorization.
3694     bool UniformIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool {
3695       auto *I = cast<Instruction>(U);
3696       return I == Ind || Worklist.count(I) ||
3697              IsVectorizedMemAccessUse(I, IndUpdate);
3698     });
3699     if (!UniformIndUpdate)
3700       continue;
3701 
3702     // The induction variable and its update instruction will remain uniform.
3703     AddToWorklistIfAllowed(Ind);
3704     AddToWorklistIfAllowed(IndUpdate);
3705   }
3706 
3707   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
3708 }
3709 
3710 bool LoopVectorizationCostModel::runtimeChecksRequired() {
3711   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
3712 
3713   if (Legal->getRuntimePointerChecking()->Need) {
3714     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
3715         "runtime pointer checks needed. Enable vectorization of this "
3716         "loop with '#pragma clang loop vectorize(enable)' when "
3717         "compiling with -Os/-Oz",
3718         "CantVersionLoopWithOptForSize", ORE, TheLoop);
3719     return true;
3720   }
3721 
3722   if (!PSE.getPredicate().isAlwaysTrue()) {
3723     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
3724         "runtime SCEV checks needed. Enable vectorization of this "
3725         "loop with '#pragma clang loop vectorize(enable)' when "
3726         "compiling with -Os/-Oz",
3727         "CantVersionLoopWithOptForSize", ORE, TheLoop);
3728     return true;
3729   }
3730 
3731   // FIXME: Avoid specializing for stride==1 instead of bailing out.
3732   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
3733     reportVectorizationFailure("Runtime stride check for small trip count",
3734         "runtime stride == 1 checks needed. Enable vectorization of "
3735         "this loop without such check by compiling with -Os/-Oz",
3736         "CantVersionLoopWithOptForSize", ORE, TheLoop);
3737     return true;
3738   }
3739 
3740   return false;
3741 }
3742 
3743 bool LoopVectorizationCostModel::isScalableVectorizationAllowed() {
3744   if (IsScalableVectorizationAllowed)
3745     return *IsScalableVectorizationAllowed;
3746 
3747   IsScalableVectorizationAllowed = false;
3748   if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
3749     return false;
3750 
3751   if (Hints->isScalableVectorizationDisabled()) {
3752     reportVectorizationInfo("Scalable vectorization is explicitly disabled",
3753                             "ScalableVectorizationDisabled", ORE, TheLoop);
3754     return false;
3755   }
3756 
3757   LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
3758 
3759   auto MaxScalableVF = ElementCount::getScalable(
3760       std::numeric_limits<ElementCount::ScalarTy>::max());
3761 
3762   // Test that the loop-vectorizer can legalize all operations for this MaxVF.
3763   // FIXME: While for scalable vectors this is currently sufficient, this should
3764   // be replaced by a more detailed mechanism that filters out specific VFs,
3765   // instead of invalidating vectorization for a whole set of VFs based on the
3766   // MaxVF.
3767 
3768   // Disable scalable vectorization if the loop contains unsupported reductions.
3769   if (!canVectorizeReductions(MaxScalableVF)) {
3770     reportVectorizationInfo(
3771         "Scalable vectorization not supported for the reduction "
3772         "operations found in this loop.",
3773         "ScalableVFUnfeasible", ORE, TheLoop);
3774     return false;
3775   }
3776 
3777   // Disable scalable vectorization if the loop contains any instructions
3778   // with element types not supported for scalable vectors.
3779   if (any_of(ElementTypesInLoop, [&](Type *Ty) {
3780         return !Ty->isVoidTy() &&
3781                !this->TTI.isElementTypeLegalForScalableVector(Ty);
3782       })) {
3783     reportVectorizationInfo("Scalable vectorization is not supported "
3784                             "for all element types found in this loop.",
3785                             "ScalableVFUnfeasible", ORE, TheLoop);
3786     return false;
3787   }
3788 
3789   if (!Legal->isSafeForAnyVectorWidth() && !getMaxVScale(*TheFunction, TTI)) {
3790     reportVectorizationInfo("The target does not provide maximum vscale value "
3791                             "for safe distance analysis.",
3792                             "ScalableVFUnfeasible", ORE, TheLoop);
3793     return false;
3794   }
3795 
3796   IsScalableVectorizationAllowed = true;
3797   return true;
3798 }
3799 
3800 ElementCount
3801 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
3802   if (!isScalableVectorizationAllowed())
3803     return ElementCount::getScalable(0);
3804 
3805   auto MaxScalableVF = ElementCount::getScalable(
3806       std::numeric_limits<ElementCount::ScalarTy>::max());
3807   if (Legal->isSafeForAnyVectorWidth())
3808     return MaxScalableVF;
3809 
3810   std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
3811   // Limit MaxScalableVF by the maximum safe dependence distance.
3812   MaxScalableVF = ElementCount::getScalable(MaxSafeElements / *MaxVScale);
3813 
3814   if (!MaxScalableVF)
3815     reportVectorizationInfo(
3816         "Max legal vector width too small, scalable vectorization "
3817         "unfeasible.",
3818         "ScalableVFUnfeasible", ORE, TheLoop);
3819 
3820   return MaxScalableVF;
3821 }
3822 
3823 FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
3824     unsigned MaxTripCount, ElementCount UserVF, bool FoldTailByMasking) {
3825   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
3826   unsigned SmallestType, WidestType;
3827   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
3828 
3829   // Get the maximum safe dependence distance in bits computed by LAA.
3830   // It is computed as MaxVF * sizeOf(type) * 8, where the type is taken from
3831   // the memory access that is most restrictive (involved in the smallest
3832   // dependence distance).
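       // For example (illustrative): a max safe vector width of 256 bits with
       // WidestType = 32 bits gives MaxSafeElements = bit_floor(256 / 32) = 8.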
3833   unsigned MaxSafeElements =
3834       llvm::bit_floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
3835 
3836   auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
3837   auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
3838   if (!Legal->isSafeForAnyVectorWidth())
3839     this->MaxSafeElements = MaxSafeElements;
3840 
3841   LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
3842                     << ".\n");
3843   LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
3844                     << ".\n");
3845 
3846   // First analyze the UserVF, fall back if the UserVF should be ignored.
3847   if (UserVF) {
3848     auto MaxSafeUserVF =
3849         UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
3850 
3851     if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
3852       // If `VF=vscale x N` is safe, then so is `VF=N`
3853       if (UserVF.isScalable())
3854         return FixedScalableVFPair(
3855             ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
3856 
3857       return UserVF;
3858     }
3859 
3860     assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
3861 
3862     // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
3863     // is better to ignore the hint and let the compiler choose a suitable VF.
3864     if (!UserVF.isScalable()) {
3865       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3866                         << " is unsafe, clamping to max safe VF="
3867                         << MaxSafeFixedVF << ".\n");
3868       ORE->emit([&]() {
3869         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
3870                                           TheLoop->getStartLoc(),
3871                                           TheLoop->getHeader())
3872                << "User-specified vectorization factor "
3873                << ore::NV("UserVectorizationFactor", UserVF)
3874                << " is unsafe, clamping to maximum safe vectorization factor "
3875                << ore::NV("VectorizationFactor", MaxSafeFixedVF);
3876       });
3877       return MaxSafeFixedVF;
3878     }
3879 
3880     if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
3881       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3882                         << " is ignored because scalable vectors are not "
3883                            "available.\n");
3884       ORE->emit([&]() {
3885         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
3886                                           TheLoop->getStartLoc(),
3887                                           TheLoop->getHeader())
3888                << "User-specified vectorization factor "
3889                << ore::NV("UserVectorizationFactor", UserVF)
3890                << " is ignored because the target does not support scalable "
3891                   "vectors. The compiler will pick a more suitable value.";
3892       });
3893     } else {
3894       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3895                         << " is unsafe. Ignoring scalable UserVF.\n");
3896       ORE->emit([&]() {
3897         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
3898                                           TheLoop->getStartLoc(),
3899                                           TheLoop->getHeader())
3900                << "User-specified vectorization factor "
3901                << ore::NV("UserVectorizationFactor", UserVF)
3902                << " is unsafe. Ignoring the hint to let the compiler pick a "
3903                   "more suitable value.";
3904       });
3905     }
3906   }
3907 
3908   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
3909                     << " / " << WidestType << " bits.\n");
3910 
3911   FixedScalableVFPair Result(ElementCount::getFixed(1),
3912                              ElementCount::getScalable(0));
3913   if (auto MaxVF =
3914           getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
3915                                   MaxSafeFixedVF, FoldTailByMasking))
3916     Result.FixedVF = MaxVF;
3917 
3918   if (auto MaxVF =
3919           getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
3920                                   MaxSafeScalableVF, FoldTailByMasking))
3921     if (MaxVF.isScalable()) {
3922       Result.ScalableVF = MaxVF;
3923       LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
3924                         << "\n");
3925     }
3926 
3927   return Result;
3928 }
3929 
3930 FixedScalableVFPair
3931 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
3932   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
3933     // TODO: It may still be useful to emit the check, since its condition is
3934     // likely to be dynamically uniform if the target can skip it.
3935     reportVectorizationFailure(
3936         "Not inserting runtime ptr check for divergent target",
3937         "runtime pointer checks needed. Not enabled for divergent target",
3938         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
3939     return FixedScalableVFPair::getNone();
3940   }
3941 
3942   ScalarEvolution *SE = PSE.getSE();
3943   unsigned TC = SE->getSmallConstantTripCount(TheLoop);
3944   unsigned MaxTC = PSE.getSmallConstantMaxTripCount();
3945   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
3946   if (TC != MaxTC)
3947     LLVM_DEBUG(dbgs() << "LV: Found maximum trip count: " << MaxTC << '\n');
3948   if (TC == 1) {
3949     reportVectorizationFailure("Single iteration (non) loop",
3950         "loop trip count is one, irrelevant for vectorization",
3951         "SingleIterationLoop", ORE, TheLoop);
3952     return FixedScalableVFPair::getNone();
3953   }
3954 
3955   // If BTC matches the widest induction type and is -1 then the trip count
3956   // computation will wrap to 0 and the vector trip count will be 0. Do not try
3957   // to vectorize.
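       // For example (illustrative): with an i32 widest induction type, a
       // backedge-taken count of -1 (0xFFFFFFFF) means 2^32 iterations, and
       // BTC + 1 wraps to a trip count of 0.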
3958   const SCEV *BTC = SE->getBackedgeTakenCount(TheLoop);
3959   if (!isa<SCEVCouldNotCompute>(BTC) &&
3960       BTC->getType()->getScalarSizeInBits() >=
3961           Legal->getWidestInductionType()->getScalarSizeInBits() &&
3962       SE->isKnownPredicate(CmpInst::ICMP_EQ, BTC,
3963                            SE->getMinusOne(BTC->getType()))) {
3964     reportVectorizationFailure(
3965         "Trip count computation wrapped",
3966         "backedge-taken count is -1, loop trip count wrapped to 0",
3967         "TripCountWrapped", ORE, TheLoop);
3968     return FixedScalableVFPair::getNone();
3969   }
3970 
3971   switch (ScalarEpilogueStatus) {
3972   case CM_ScalarEpilogueAllowed:
3973     return computeFeasibleMaxVF(MaxTC, UserVF, false);
3974   case CM_ScalarEpilogueNotAllowedUsePredicate:
3975     [[fallthrough]];
3976   case CM_ScalarEpilogueNotNeededUsePredicate:
3977     LLVM_DEBUG(
3978         dbgs() << "LV: vector predicate hint/switch found.\n"
3979                << "LV: Not allowing scalar epilogue, creating predicated "
3980                << "vector loop.\n");
3981     break;
3982   case CM_ScalarEpilogueNotAllowedLowTripLoop:
3983     // fallthrough as a special case of OptForSize
3984   case CM_ScalarEpilogueNotAllowedOptSize:
3985     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
3986       LLVM_DEBUG(
3987           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
3988     else
3989       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
3990                         << "count.\n");
3991 
3992     // Bail if runtime checks are required, which are not good when optimising
3993     // for size.
3994     if (runtimeChecksRequired())
3995       return FixedScalableVFPair::getNone();
3996 
3997     break;
3998   }
3999 
4000   // The only loops we can vectorize without a scalar epilogue, are loops with
4001   // a bottom-test and a single exiting block. We'd have to handle the fact
4002   // that not every instruction executes on the last iteration.  This will
4003   // require a lane mask which varies through the vector loop body.  (TODO)
4004   if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
4005     // If there was a tail-folding hint/switch, but we can't fold the tail by
4006     // masking, fallback to a vectorization with a scalar epilogue.
4007     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
4008       LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
4009                            "scalar epilogue instead.\n");
4010       ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4011       return computeFeasibleMaxVF(MaxTC, UserVF, false);
4012     }
4013     return FixedScalableVFPair::getNone();
4014   }
4015 
4016   // Now try tail folding.
4017 
4018   // Invalidate interleave groups that require an epilogue if we can't mask
4019   // the interleave-group.
4020   if (!useMaskedInterleavedAccesses(TTI)) {
4021     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
4022            "No decisions should have been taken at this point");
4023     // Note: There is no need to invalidate any cost modeling decisions here, as
4024     // none were taken so far.
4025     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
4026   }
4027 
4028   FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(MaxTC, UserVF, true);
4029 
4030   // Avoid tail folding if the trip count is known to be a multiple of any VF
4031   // we choose.
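       // For example (illustrative): with a trip count of 128 and
       // MaxPowerOf2RuntimeVF = 16 (no user-specified IC), 128 % 16 == 0, so no
       // tail remains and tail folding is unnecessary.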
4032   std::optional<unsigned> MaxPowerOf2RuntimeVF =
4033       MaxFactors.FixedVF.getFixedValue();
4034   if (MaxFactors.ScalableVF) {
4035     std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
4036     if (MaxVScale && TTI.isVScaleKnownToBeAPowerOfTwo()) {
4037       MaxPowerOf2RuntimeVF = std::max<unsigned>(
4038           *MaxPowerOf2RuntimeVF,
4039           *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue());
4040     } else
4041       MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now.
4042   }
4043 
4044   if (MaxPowerOf2RuntimeVF && *MaxPowerOf2RuntimeVF > 0) {
4045     assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
4046            "MaxFixedVF must be a power of 2");
4047     unsigned MaxVFtimesIC =
4048         UserIC ? *MaxPowerOf2RuntimeVF * UserIC : *MaxPowerOf2RuntimeVF;
4049     ScalarEvolution *SE = PSE.getSE();
4050     // Currently only loops with countable exits are vectorized, but calling
4051     // getSymbolicMaxBackedgeTakenCount allows enablement work for loops with
4052     // uncountable exits whilst also ensuring the symbolic maximum and known
4053     // back-edge taken count remain identical for loops with countable exits.
4054     const SCEV *BackedgeTakenCount = PSE.getSymbolicMaxBackedgeTakenCount();
4055     assert(BackedgeTakenCount == PSE.getBackedgeTakenCount() &&
4056            "Invalid loop count");
4057     const SCEV *ExitCount = SE->getAddExpr(
4058         BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
4059     const SCEV *Rem = SE->getURemExpr(
4060         SE->applyLoopGuards(ExitCount, TheLoop),
4061         SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
4062     if (Rem->isZero()) {
4063       // Accept MaxFixedVF if we do not have a tail.
4064       LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
4065       return MaxFactors;
4066     }
4067   }
4068 
4069   // If we don't know the precise trip count, or if the trip count that we
4070   // found modulo the vectorization factor is not zero, try to fold the tail
4071   // by masking.
4072   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
4073   setTailFoldingStyles(MaxFactors.ScalableVF.isScalable(), UserIC);
4074   if (foldTailByMasking()) {
4075     if (getTailFoldingStyle() == TailFoldingStyle::DataWithEVL) {
4076       LLVM_DEBUG(
4077           dbgs()
4078           << "LV: tail is folded with EVL, forcing unroll factor to be 1. Will "
4079              "try to generate VP Intrinsics with scalable vector "
4080              "factors only.\n");
4081       // Tail folded loop using VP intrinsics restricts the VF to be scalable
4082       // for now.
4083       // TODO: extend it for fixed vectors, if required.
4084       assert(MaxFactors.ScalableVF.isScalable() &&
4085              "Expected scalable vector factor.");
4086 
4087       MaxFactors.FixedVF = ElementCount::getFixed(1);
4088     }
4089     return MaxFactors;
4090   }
4091 
4092   // If there was a tail-folding hint/switch, but we can't fold the tail by
4093   // masking, fallback to a vectorization with a scalar epilogue.
4094   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
4095     LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
4096                          "scalar epilogue instead.\n");
4097     ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4098     return MaxFactors;
4099   }
4100 
4101   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
4102     LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
4103     return FixedScalableVFPair::getNone();
4104   }
4105 
4106   if (TC == 0) {
4107     reportVectorizationFailure(
4108         "unable to calculate the loop count due to complex control flow",
4109         "UnknownLoopCountComplexCFG", ORE, TheLoop);
4110     return FixedScalableVFPair::getNone();
4111   }
4112 
4113   reportVectorizationFailure(
4114       "Cannot optimize for size and vectorize at the same time.",
4115       "cannot optimize for size and vectorize at the same time. "
4116       "Enable vectorization of this loop with '#pragma clang loop "
4117       "vectorize(enable)' when compiling with -Os/-Oz",
4118       "NoTailLoopWithOptForSize", ORE, TheLoop);
4119   return FixedScalableVFPair::getNone();
4120 }
4121 
4122 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
4123     unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType,
4124     ElementCount MaxSafeVF, bool FoldTailByMasking) {
4125   bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
4126   const TypeSize WidestRegister = TTI.getRegisterBitWidth(
4127       ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
4128                            : TargetTransformInfo::RGK_FixedWidthVector);
4129 
4130   // Convenience function to return the minimum of two ElementCounts.
4131   auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
4132     assert((LHS.isScalable() == RHS.isScalable()) &&
4133            "Scalable flags must match");
4134     return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
4135   };
4136 
4137   // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
4138   // Note that both WidestRegister and WidestType may not be powers of 2.
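       // For example (illustrative): a 256-bit fixed-width register with
       // WidestType = 32 bits yields bit_floor(256 / 32) = 8 lanes.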
4139   auto MaxVectorElementCount = ElementCount::get(
4140       llvm::bit_floor(WidestRegister.getKnownMinValue() / WidestType),
4141       ComputeScalableMaxVF);
4142   MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
4143   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
4144                     << (MaxVectorElementCount * WidestType) << " bits.\n");
4145 
4146   if (!MaxVectorElementCount) {
4147     LLVM_DEBUG(dbgs() << "LV: The target has no "
4148                       << (ComputeScalableMaxVF ? "scalable" : "fixed")
4149                       << " vector registers.\n");
4150     return ElementCount::getFixed(1);
4151   }
4152 
4153   unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue();
4154   if (MaxVectorElementCount.isScalable() &&
4155       TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
4156     auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
4157     auto Min = Attr.getVScaleRangeMin();
4158     WidestRegisterMinEC *= Min;
4159   }
4160 
4161   // When a scalar epilogue is required, at least one iteration of the scalar
4162   // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
4163   // max VF that results in a dead vector loop.
4164   if (MaxTripCount > 0 && requiresScalarEpilogue(true))
4165     MaxTripCount -= 1;
4166 
4167   if (MaxTripCount && MaxTripCount <= WidestRegisterMinEC &&
4168       (!FoldTailByMasking || isPowerOf2_32(MaxTripCount))) {
4169     // If the upper bound loop trip count (TC) is known at compile time, there is
4170     // no point in choosing a VF greater than TC (as done in the loop below). Select
4171     // the maximum power of two which doesn't exceed TC. If MaxVectorElementCount is
4172     // scalable, we only fall back on a fixed VF when the TC is less than or
4173     // equal to the known number of lanes.
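         // For example (illustrative): a known maximum trip count of 7 clamps
         // the chosen VF to bit_floor(7) = 4.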
4174     auto ClampedUpperTripCount = llvm::bit_floor(MaxTripCount);
4175     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
4176                          "exceeding the constant trip count: "
4177                       << ClampedUpperTripCount << "\n");
4178     return ElementCount::get(
4179         ClampedUpperTripCount,
4180         FoldTailByMasking ? MaxVectorElementCount.isScalable() : false);
4181   }
4182 
4183   TargetTransformInfo::RegisterKind RegKind =
4184       ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
4185                            : TargetTransformInfo::RGK_FixedWidthVector;
4186   ElementCount MaxVF = MaxVectorElementCount;
4187   if (MaximizeBandwidth ||
4188       (MaximizeBandwidth.getNumOccurrences() == 0 &&
4189        (TTI.shouldMaximizeVectorBandwidth(RegKind) ||
4190         (UseWiderVFIfCallVariantsPresent && Legal->hasVectorCallVariants())))) {
4191     auto MaxVectorElementCountMaxBW = ElementCount::get(
4192         llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType),
4193         ComputeScalableMaxVF);
4194     MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
4195 
4196     // Collect all viable vectorization factors larger than the default MaxVF
4197     // (i.e. MaxVectorElementCount).
4198     SmallVector<ElementCount, 8> VFs;
4199     for (ElementCount VS = MaxVectorElementCount * 2;
4200          ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
4201       VFs.push_back(VS);
4202 
4203     // For each VF calculate its register usage.
4204     auto RUs = calculateRegisterUsage(VFs);
4205 
4206     // Select the largest VF which doesn't require more registers than existing
4207     // ones.
4208     for (int I = RUs.size() - 1; I >= 0; --I) {
4209       const auto &MLU = RUs[I].MaxLocalUsers;
4210       if (all_of(MLU, [&](decltype(MLU.front()) &LU) {
4211             return LU.second <= TTI.getNumberOfRegisters(LU.first);
4212           })) {
4213         MaxVF = VFs[I];
4214         break;
4215       }
4216     }
4217     if (ElementCount MinVF =
4218             TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
4219       if (ElementCount::isKnownLT(MaxVF, MinVF)) {
4220         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
4221                           << ") with target's minimum: " << MinVF << '\n');
4222         MaxVF = MinVF;
4223       }
4224     }
4225 
4226     // Invalidate any widening decisions we might have made, in case the loop
4227     // requires prediction (decided later), but we have already made some
4228     // load/store widening decisions.
4229     invalidateCostModelingDecisions();
4230   }
4231   return MaxVF;
4232 }
4233 
4234 /// Convenience function that returns the value of vscale_range if
4235 /// vscale_range.min == vscale_range.max, or otherwise returns the value
4236 /// returned by the corresponding TTI method.
4237 static std::optional<unsigned>
4238 getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI) {
4239   const Function *Fn = L->getHeader()->getParent();
4240   if (Fn->hasFnAttribute(Attribute::VScaleRange)) {
4241     auto Attr = Fn->getFnAttribute(Attribute::VScaleRange);
4242     auto Min = Attr.getVScaleRangeMin();
4243     auto Max = Attr.getVScaleRangeMax();
4244     if (Max && Min == Max)
4245       return Max;
4246   }
4247 
4248   return TTI.getVScaleForTuning();
4249 }
4250 
4251 /// This function attempts to return a value that represents the vectorization
4252 /// factor at runtime. For fixed-width VFs we know this precisely at compile
4253 /// time, but for scalable VFs we calculate it based on an estimate of the
4254 /// vscale value.
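     /// For example (illustrative), VF = vscale x 4 with an estimated vscale of
     /// 2 yields an estimated runtime VF of 8.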
4255 static unsigned getEstimatedRuntimeVF(const Loop *L,
4256                                       const TargetTransformInfo &TTI,
4257                                       ElementCount VF) {
4258   unsigned EstimatedVF = VF.getKnownMinValue();
4259   if (VF.isScalable())
4260     if (std::optional<unsigned> VScale = getVScaleForTuning(L, TTI))
4261       EstimatedVF *= *VScale;
4262   assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
4263   return EstimatedVF;
4264 }
4265 
4266 bool LoopVectorizationPlanner::isMoreProfitable(
4267     const VectorizationFactor &A, const VectorizationFactor &B,
4268     const unsigned MaxTripCount) const {
4269   InstructionCost CostA = A.Cost;
4270   InstructionCost CostB = B.Cost;
4271 
4272   // Improve estimate for the vector width if it is scalable.
4273   unsigned EstimatedWidthA = A.Width.getKnownMinValue();
4274   unsigned EstimatedWidthB = B.Width.getKnownMinValue();
4275   if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI)) {
4276     if (A.Width.isScalable())
4277       EstimatedWidthA *= *VScale;
4278     if (B.Width.isScalable())
4279       EstimatedWidthB *= *VScale;
4280   }
4281 
4282   // Assume vscale may be larger than 1 (or the value being tuned for),
4283   // so that scalable vectorization is slightly favorable over fixed-width
4284   // vectorization.
4285   bool PreferScalable = !TTI.preferFixedOverScalableIfEqualCost() &&
4286                         A.Width.isScalable() && !B.Width.isScalable();
4287 
4288   auto CmpFn = [PreferScalable](const InstructionCost &LHS,
4289                                 const InstructionCost &RHS) {
4290     return PreferScalable ? LHS <= RHS : LHS < RHS;
4291   };
4292 
4293   // To avoid the need for FP division:
4294   //      (CostA / EstimatedWidthA) < (CostB / EstimatedWidthB)
4295   // <=>  (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA)
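       // For example (illustrative): CostA = 10 at width 4 versus CostB = 12 at
       // width 8 compares 10 * 8 = 80 against 12 * 4 = 48, so A (per-lane cost
       // 2.5) is not considered more profitable than B (per-lane cost 1.5).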
4296   if (!MaxTripCount)
4297     return CmpFn(CostA * EstimatedWidthB, CostB * EstimatedWidthA);
4298 
4299   auto GetCostForTC = [MaxTripCount, this](unsigned VF,
4300                                            InstructionCost VectorCost,
4301                                            InstructionCost ScalarCost) {
4302     // If the trip count is a known (possibly small) constant, the trip count
4303     // will be rounded up to an integer number of iterations under
4304     // FoldTailByMasking. The total cost in that case will be
4305     // VecCost*ceil(TripCount/VF). When not folding the tail, the total
4306     // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be
4307     // some extra overheads, but for the purpose of comparing the costs of
4308     // different VFs we can use this to compare the total loop-body cost
4309     // expected after vectorization.
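         // For example (illustrative): MaxTripCount = 17, VF = 4, VectorCost =
         // 20 and ScalarCost = 8 give 20 * ceil(17 / 4) = 100 when folding the
         // tail, and 20 * (17 / 4) + 8 * (17 % 4) = 80 + 8 = 88 when not.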
4310     if (CM.foldTailByMasking())
4311       return VectorCost * divideCeil(MaxTripCount, VF);
4312     return VectorCost * (MaxTripCount / VF) + ScalarCost * (MaxTripCount % VF);
4313   };
4314 
4315   auto RTCostA = GetCostForTC(EstimatedWidthA, CostA, A.ScalarCost);
4316   auto RTCostB = GetCostForTC(EstimatedWidthB, CostB, B.ScalarCost);
4317   return CmpFn(RTCostA, RTCostB);
4318 }
4319 
4320 bool LoopVectorizationPlanner::isMoreProfitable(
4321     const VectorizationFactor &A, const VectorizationFactor &B) const {
4322   const unsigned MaxTripCount = PSE.getSmallConstantMaxTripCount();
4323   return LoopVectorizationPlanner::isMoreProfitable(A, B, MaxTripCount);
4324 }
4325 
4326 void LoopVectorizationPlanner::emitInvalidCostRemarks(
4327     OptimizationRemarkEmitter *ORE) {
4328   using RecipeVFPair = std::pair<VPRecipeBase *, ElementCount>;
4329   SmallVector<RecipeVFPair> InvalidCosts;
4330   for (const auto &Plan : VPlans) {
4331     for (ElementCount VF : Plan->vectorFactors()) {
4332       VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(),
4333                             CM, CM.CostKind);
4334       precomputeCosts(*Plan, VF, CostCtx);
4335       auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry());
4336       for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
4337         for (auto &R : *VPBB) {
4338           if (!R.cost(VF, CostCtx).isValid())
4339             InvalidCosts.emplace_back(&R, VF);
4340         }
4341       }
4342     }
4343   }
4344   if (InvalidCosts.empty())
4345     return;
4346 
4347   // Emit a report of VFs with invalid costs in the loop.
4348 
4349   // Group the remarks per recipe, keeping the recipe order from InvalidCosts.
4350   DenseMap<VPRecipeBase *, unsigned> Numbering;
4351   unsigned I = 0;
4352   for (auto &Pair : InvalidCosts)
4353     if (!Numbering.count(Pair.first))
4354       Numbering[Pair.first] = I++;
4355 
4356   // Sort the list, first on recipe(number) then on VF.
4357   sort(InvalidCosts, [&Numbering](RecipeVFPair &A, RecipeVFPair &B) {
4358     if (Numbering[A.first] != Numbering[B.first])
4359       return Numbering[A.first] < Numbering[B.first];
4360     const auto &LHS = A.second;
4361     const auto &RHS = B.second;
4362     return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
4363            std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
4364   });
4365 
4366   // For a list of ordered recipe-VF pairs:
4367   //   [(load, VF1), (load, VF2), (store, VF1)]
4368   // group the recipes together to emit separate remarks for:
4369   //   load  (VF1, VF2)
4370   //   store (VF1)
4371   auto Tail = ArrayRef<RecipeVFPair>(InvalidCosts);
4372   auto Subset = ArrayRef<RecipeVFPair>();
4373   do {
4374     if (Subset.empty())
4375       Subset = Tail.take_front(1);
4376 
4377     VPRecipeBase *R = Subset.front().first;
4378 
4379     unsigned Opcode =
4380         TypeSwitch<const VPRecipeBase *, unsigned>(R)
4381             .Case<VPHeaderPHIRecipe>(
4382                 [](const auto *R) { return Instruction::PHI; })
4383             .Case<VPWidenSelectRecipe>(
4384                 [](const auto *R) { return Instruction::Select; })
4385             .Case<VPWidenStoreRecipe>(
4386                 [](const auto *R) { return Instruction::Store; })
4387             .Case<VPWidenLoadRecipe>(
4388                 [](const auto *R) { return Instruction::Load; })
4389             .Case<VPWidenCallRecipe, VPWidenIntrinsicRecipe>(
4390                 [](const auto *R) { return Instruction::Call; })
4391             .Case<VPInstruction, VPWidenRecipe, VPReplicateRecipe,
4392                   VPWidenCastRecipe>(
4393                 [](const auto *R) { return R->getOpcode(); })
4394             .Case<VPInterleaveRecipe>([](const VPInterleaveRecipe *R) {
4395               return R->getStoredValues().empty() ? Instruction::Load
4396                                                   : Instruction::Store;
4397             });
4398 
4399     // If the next recipe is different, or if there are no other pairs,
4400     // emit a remark for the collated subset. e.g.
4401     //   [(load, VF1), (load, VF2)]
4402     // to emit:
4403     //  remark: invalid costs for 'load' at VF=(VF1, VF2)
4404     if (Subset == Tail || Tail[Subset.size()].first != R) {
4405       std::string OutString;
4406       raw_string_ostream OS(OutString);
4407       assert(!Subset.empty() && "Unexpected empty range");
4408       OS << "Recipe with invalid costs prevented vectorization at VF=(";
4409       for (const auto &Pair : Subset)
4410         OS << (Pair.second == Subset.front().second ? "" : ", ") << Pair.second;
4411       OS << "):";
4412       if (Opcode == Instruction::Call) {
4413         StringRef Name = "";
4414         if (auto *Int = dyn_cast<VPWidenIntrinsicRecipe>(R)) {
4415           Name = Int->getIntrinsicName();
4416         } else {
4417           auto *WidenCall = dyn_cast<VPWidenCallRecipe>(R);
4418           Function *CalledFn =
4419               WidenCall ? WidenCall->getCalledScalarFunction()
4420                         : cast<Function>(R->getOperand(R->getNumOperands() - 1)
4421                                              ->getLiveInIRValue());
4422           Name = CalledFn->getName();
4423         }
4424         OS << " call to " << Name;
4425       } else
4426         OS << " " << Instruction::getOpcodeName(Opcode);
4427       reportVectorizationInfo(OutString, "InvalidCost", ORE, OrigLoop, nullptr,
4428                               R->getDebugLoc());
4429       Tail = Tail.drop_front(Subset.size());
4430       Subset = {};
4431     } else
4432       // Grow the subset by one element
4433       Subset = Tail.take_front(Subset.size() + 1);
4434   } while (!Tail.empty());
4435 }
4436 
4437 /// Check if any recipe of \p Plan will generate a vector value, which will be
4438 /// assigned a vector register.
4439 static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
4440                                 const TargetTransformInfo &TTI) {
4441   assert(VF.isVector() && "Checking a scalar VF?");
4442   VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
4443   DenseSet<VPRecipeBase *> EphemeralRecipes;
4444   collectEphemeralRecipesForVPlan(Plan, EphemeralRecipes);
4445   // Set of already visited types.
4446   DenseSet<Type *> Visited;
4447   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
4448            vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
4449     for (VPRecipeBase &R : *VPBB) {
4450       if (EphemeralRecipes.contains(&R))
4451         continue;
4452       // Continue early if the recipe is considered to not produce a vector
4453       // result. Note that this includes VPInstruction where some opcodes may
4454       // produce a vector, to preserve existing behavior as VPInstructions model
4455       // aspects not directly mapped to existing IR instructions.
4456       switch (R.getVPDefID()) {
4457       case VPDef::VPDerivedIVSC:
4458       case VPDef::VPScalarIVStepsSC:
4459       case VPDef::VPScalarCastSC:
4460       case VPDef::VPReplicateSC:
4461       case VPDef::VPInstructionSC:
4462       case VPDef::VPCanonicalIVPHISC:
4463       case VPDef::VPVectorPointerSC:
4464       case VPDef::VPReverseVectorPointerSC:
4465       case VPDef::VPExpandSCEVSC:
4466       case VPDef::VPEVLBasedIVPHISC:
4467       case VPDef::VPPredInstPHISC:
4468       case VPDef::VPBranchOnMaskSC:
4469         continue;
4470       case VPDef::VPReductionSC:
4471       case VPDef::VPActiveLaneMaskPHISC:
4472       case VPDef::VPWidenCallSC:
4473       case VPDef::VPWidenCanonicalIVSC:
4474       case VPDef::VPWidenCastSC:
4475       case VPDef::VPWidenGEPSC:
4476       case VPDef::VPWidenIntrinsicSC:
4477       case VPDef::VPWidenSC:
4478       case VPDef::VPWidenSelectSC:
4479       case VPDef::VPBlendSC:
4480       case VPDef::VPFirstOrderRecurrencePHISC:
4481       case VPDef::VPWidenPHISC:
4482       case VPDef::VPWidenIntOrFpInductionSC:
4483       case VPDef::VPWidenPointerInductionSC:
4484       case VPDef::VPReductionPHISC:
4485       case VPDef::VPInterleaveSC:
4486       case VPDef::VPWidenLoadEVLSC:
4487       case VPDef::VPWidenLoadSC:
4488       case VPDef::VPWidenStoreEVLSC:
4489       case VPDef::VPWidenStoreSC:
4490         break;
4491       default:
4492         llvm_unreachable("unhandled recipe");
4493       }
4494 
4495       auto WillWiden = [&TTI, VF](Type *ScalarTy) {
4496         Type *VectorTy = toVectorTy(ScalarTy, VF);
4497         unsigned NumLegalParts = TTI.getNumberOfParts(VectorTy);
4498         if (!NumLegalParts)
4499           return false;
4500         if (VF.isScalable()) {
4501           // <vscale x 1 x iN> is assumed to be profitable over iN because
4502           // scalable registers are a distinct register class from scalar
4503           // ones. If we ever find a target which wants to lower scalable
4504           // vectors back to scalars, we'll need to update this code to
4505           // explicitly ask TTI about the register class uses for each part.
4506           return NumLegalParts <= VF.getKnownMinValue();
4507         }
4508         // Two or more lanes that share a register part are vectorized.
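             // For example (illustrative): <8 x i32> on a target with 128-bit
             // vector registers legalizes to 2 parts; 2 < 8, so the value is
             // treated as genuinely vectorized.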
4509         return NumLegalParts < VF.getKnownMinValue();
4510       };
4511 
4512       // No defs and not a store (e.g. a branch): no value to check; continue.
4513       if (R.getNumDefinedValues() == 0 &&
4514           !isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe, VPInterleaveRecipe>(
4515               &R))
4516         continue;
4517       // For multi-def recipes (currently only interleaved loads), it suffices
4518       // to check the first def only.
4519       // For stores, check their stored value; for interleaved stores, it
4520       // suffices to check the first stored value only. In all cases this is
4521       // the second operand.
4522       VPValue *ToCheck =
4523           R.getNumDefinedValues() >= 1 ? R.getVPValue(0) : R.getOperand(1);
4524       Type *ScalarTy = TypeInfo.inferScalarType(ToCheck);
4525       if (!Visited.insert({ScalarTy}).second)
4526         continue;
4527       if (WillWiden(ScalarTy))
4528         return true;
4529     }
4530   }
4531 
4532   return false;
4533 }
4534 
4535 #ifndef NDEBUG
4536 VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
4537   InstructionCost ExpectedCost = CM.expectedCost(ElementCount::getFixed(1));
4538   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
4539   assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
4540   assert(any_of(VPlans,
4541                 [](std::unique_ptr<VPlan> &P) {
4542                   return P->hasVF(ElementCount::getFixed(1));
4543                 }) &&
4544          "Expected Scalar VF to be a candidate");
4545 
4546   const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost,
4547                                        ExpectedCost);
4548   VectorizationFactor ChosenFactor = ScalarCost;
4549 
4550   bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
4551   if (ForceVectorization &&
4552       (VPlans.size() > 1 || !VPlans[0]->hasScalarVFOnly())) {
4553     // Ignore scalar width, because the user explicitly wants vectorization.
4554     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
4555     // evaluation.
4556     ChosenFactor.Cost = InstructionCost::getMax();
4557   }
4558 
4559   for (auto &P : VPlans) {
4560     for (ElementCount VF : P->vectorFactors()) {
4561       // The cost for scalar VF=1 is already calculated, so ignore it.
4562       if (VF.isScalar())
4563         continue;
4564 
4565       InstructionCost C = CM.expectedCost(VF);
4566       VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost);
4567 
4568       unsigned Width = getEstimatedRuntimeVF(OrigLoop, TTI, Candidate.Width);
4569       LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << VF
4570                         << " costs: " << (Candidate.Cost / Width));
4571       if (VF.isScalable())
4572         LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
4573                           << getVScaleForTuning(OrigLoop, TTI).value_or(1)
4574                           << ")");
4575       LLVM_DEBUG(dbgs() << ".\n");
4576 
4577       if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
4578         LLVM_DEBUG(
4579             dbgs()
4580             << "LV: Not considering vector loop of width " << VF
4581             << " because it will not generate any vector instructions.\n");
4582         continue;
4583       }
4584 
4585       if (isMoreProfitable(Candidate, ChosenFactor))
4586         ChosenFactor = Candidate;
4587     }
4588   }
4589 
4590   if (!EnableCondStoresVectorization && CM.hasPredStores()) {
4591     reportVectorizationFailure(
4592         "There are conditional stores.",
4593         "store that is conditionally executed prevents vectorization",
4594         "ConditionalStore", ORE, OrigLoop);
4595     ChosenFactor = ScalarCost;
4596   }
4597 
4598   LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
4599                  !isMoreProfitable(ChosenFactor, ScalarCost)) dbgs()
4600              << "LV: Vectorization seems to be not beneficial, "
4601              << "but was forced by a user.\n");
4602   return ChosenFactor;
4603 }
4604 #endif
4605 
4606 bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
4607     ElementCount VF) const {
4608   // Cross iteration phis such as reductions need special handling and are
4609   // currently unsupported.
4610   if (any_of(OrigLoop->getHeader()->phis(),
4611              [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(&Phi); }))
4612     return false;
4613 
4614   // Phis with uses outside of the loop require special handling and are
4615   // currently unsupported.
4616   for (const auto &Entry : Legal->getInductionVars()) {
4617     // Look for uses of the value of the induction at the last iteration.
4618     Value *PostInc =
4619         Entry.first->getIncomingValueForBlock(OrigLoop->getLoopLatch());
4620     for (User *U : PostInc->users())
4621       if (!OrigLoop->contains(cast<Instruction>(U)))
4622         return false;
4623     // Look for uses of penultimate value of the induction.
4624     for (User *U : Entry.first->users())
4625       if (!OrigLoop->contains(cast<Instruction>(U)))
4626         return false;
4627   }
4628 
4629   // Epilogue vectorization code has not been audited to ensure it handles
4630   // non-latch exits properly.  It may be fine, but it needs to be audited
4631   // and tested.
4632   // TODO: Add support for loops with an early exit.
4633   if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch())
4634     return false;
4635 
4636   return true;
4637 }
4638 
4639 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
4640     const ElementCount VF, const unsigned IC) const {
4641   // FIXME: We need a much better cost-model to take different parameters such
4642   // as register pressure, code size increase and cost of extra branches into
4643   // account. For now we apply a very crude heuristic and only consider loops
4644   // with vectorization factors larger than a certain value.
4645 
4646   // Allow the target to opt out entirely.
4647   if (!TTI.preferEpilogueVectorization())
4648     return false;
4649 
4650   // We also consider epilogue vectorization unprofitable for targets that don't
4651   // consider interleaving beneficial (e.g. MVE).
4652   if (TTI.getMaxInterleaveFactor(VF) <= 1)
4653     return false;
4654 
4655   // TODO: PR #108190 introduced a discrepancy between fixed-width and scalable
4656   // VFs when deciding profitability.
4657   // See related "TODO: extend to support scalable VFs." in
4658   // selectEpilogueVectorizationFactor.
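       // For example (illustrative): a fixed main-loop VF of 8 with IC = 2 and
       // a minimum-VF threshold of 16 gives an estimated runtime VF of 16,
       // which meets the threshold.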
4659   unsigned Multiplier = VF.isFixed() ? IC : 1;
4660   unsigned MinVFThreshold = EpilogueVectorizationMinVF.getNumOccurrences() > 0
4661                                 ? EpilogueVectorizationMinVF
4662                                 : TTI.getEpilogueVectorizationMinVF();
4663   return getEstimatedRuntimeVF(TheLoop, TTI, VF * Multiplier) >= MinVFThreshold;
4664 }
4665 
4666 VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
4667     const ElementCount MainLoopVF, unsigned IC) {
4668   VectorizationFactor Result = VectorizationFactor::Disabled();
4669   if (!EnableEpilogueVectorization) {
4670     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n");
4671     return Result;
4672   }
4673 
4674   if (!CM.isScalarEpilogueAllowed()) {
4675     LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no "
4676                          "epilogue is allowed.\n");
4677     return Result;
4678   }
4679 
4680   // Not really a cost consideration, but check for unsupported cases here to
4681   // simplify the logic.
4682   if (!isCandidateForEpilogueVectorization(MainLoopVF)) {
4683     LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop "
4684                          "is not a supported candidate.\n");
4685     return Result;
4686   }
4687 
4688   if (EpilogueVectorizationForceVF > 1) {
4689     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n");
4690     ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF);
4691     if (hasPlanWithVF(ForcedEC))
4692       return {ForcedEC, 0, 0};
4693 
4694     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not "
4695                          "viable.\n");
4696     return Result;
4697   }
4698 
4699   if (OrigLoop->getHeader()->getParent()->hasOptSize() ||
4700       OrigLoop->getHeader()->getParent()->hasMinSize()) {
4701     LLVM_DEBUG(
4702         dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n");
4703     return Result;
4704   }
4705 
4706   if (!CM.isEpilogueVectorizationProfitable(MainLoopVF, IC)) {
4707     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
4708                          "this loop\n");
4709     return Result;
4710   }
4711 
4712   // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
4713   // the main loop handles 8 lanes per iteration. We could still benefit from
4714   // vectorizing the epilogue loop with VF=4.
4715   ElementCount EstimatedRuntimeVF =
4716       ElementCount::getFixed(getEstimatedRuntimeVF(OrigLoop, TTI, MainLoopVF));
4717 
4718   ScalarEvolution &SE = *PSE.getSE();
4719   Type *TCType = Legal->getWidestInductionType();
4720   const SCEV *RemainingIterations = nullptr;
4721   unsigned MaxTripCount = 0;
4722   for (auto &NextVF : ProfitableVFs) {
4723     // Skip candidate VFs without a corresponding VPlan.
4724     if (!hasPlanWithVF(NextVF.Width))
4725       continue;
4726 
4727     // Skip candidate VFs with widths >= the (estimated) runtime VF (scalable
4728     // vectors) or > the VF of the main loop (fixed vectors).
4729     if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
4730          ElementCount::isKnownGE(NextVF.Width, EstimatedRuntimeVF)) ||
4731         (NextVF.Width.isScalable() &&
4732          ElementCount::isKnownGE(NextVF.Width, MainLoopVF)) ||
4733         (!NextVF.Width.isScalable() && !MainLoopVF.isScalable() &&
4734          ElementCount::isKnownGT(NextVF.Width, MainLoopVF)))
4735       continue;
4736 
4737     // If NextVF is greater than the number of remaining iterations, the
4738     // epilogue loop would be dead. Skip such factors.
4739     if (!MainLoopVF.isScalable() && !NextVF.Width.isScalable()) {
4740       // TODO: extend to support scalable VFs.
4741       if (!RemainingIterations) {
4742         const SCEV *TC = vputils::getSCEVExprForVPValue(
4743             getPlanFor(NextVF.Width).getTripCount(), SE);
4744         assert(!isa<SCEVCouldNotCompute>(TC) &&
4745                "Trip count SCEV must be computable");
4746         RemainingIterations = SE.getURemExpr(
4747             TC, SE.getConstant(TCType, MainLoopVF.getKnownMinValue() * IC));
4748         MaxTripCount = MainLoopVF.getKnownMinValue() * IC - 1;
4749         if (SE.isKnownPredicate(CmpInst::ICMP_ULT, RemainingIterations,
4750                                 SE.getConstant(TCType, MaxTripCount))) {
4751           MaxTripCount =
4752               SE.getUnsignedRangeMax(RemainingIterations).getZExtValue();
4753         }
4754         LLVM_DEBUG(dbgs() << "LEV: Maximum Trip Count for Epilogue: "
4755                           << MaxTripCount << "\n");
4756       }
4757       if (SE.isKnownPredicate(
4758               CmpInst::ICMP_UGT,
4759               SE.getConstant(TCType, NextVF.Width.getKnownMinValue()),
4760               RemainingIterations))
4761         continue;
4762     }
4763 
4764     if (Result.Width.isScalar() ||
4765         isMoreProfitable(NextVF, Result, MaxTripCount))
4766       Result = NextVF;
4767   }
4768 
4769   if (Result != VectorizationFactor::Disabled())
4770     LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
4771                       << Result.Width << "\n");
4772   return Result;
4773 }
4774 
4775 std::pair<unsigned, unsigned>
4776 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
4777   unsigned MinWidth = -1U;
4778   unsigned MaxWidth = 8;
4779   const DataLayout &DL = TheFunction->getDataLayout();
4780   // For in-loop reductions, no element types are added to ElementTypesInLoop
4781   // if there are no loads/stores in the loop. In this case, check through the
4782   // reduction variables to determine the maximum width.
4783   if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
4784     // Reset MaxWidth so that we can find the smallest type used by recurrences
4785     // in the loop.
4786     MaxWidth = -1U;
4787     for (const auto &PhiDescriptorPair : Legal->getReductionVars()) {
4788       const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
4789       // When finding the min width used by the recurrence we need to account
4790       // for casts on the input operands of the recurrence.
4791       MaxWidth = std::min<unsigned>(
4792           MaxWidth, std::min<unsigned>(
4793                         RdxDesc.getMinWidthCastToRecurrenceTypeInBits(),
4794                         RdxDesc.getRecurrenceType()->getScalarSizeInBits()));
4795     }
4796   } else {
4797     for (Type *T : ElementTypesInLoop) {
4798       MinWidth = std::min<unsigned>(
4799           MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
4800       MaxWidth = std::max<unsigned>(
4801           MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
4802     }
4803   }
4804   return {MinWidth, MaxWidth};
4805 }
4806 
4807 void LoopVectorizationCostModel::collectElementTypesForWidening() {
4808   ElementTypesInLoop.clear();
4809   // For each block.
4810   for (BasicBlock *BB : TheLoop->blocks()) {
4811     // For each instruction in the loop.
4812     for (Instruction &I : BB->instructionsWithoutDebug()) {
4813       Type *T = I.getType();
4814 
4815       // Skip ignored values.
4816       if (ValuesToIgnore.count(&I))
4817         continue;
4818 
4819       // Only examine Loads, Stores and PHINodes.
4820       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
4821         continue;
4822 
4823       // Examine PHI nodes that are reduction variables. Update the type to
4824       // account for the recurrence type.
4825       if (auto *PN = dyn_cast<PHINode>(&I)) {
4826         if (!Legal->isReductionVariable(PN))
4827           continue;
4828         const RecurrenceDescriptor &RdxDesc =
4829             Legal->getReductionVars().find(PN)->second;
4830         if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
4831             TTI.preferInLoopReduction(RdxDesc.getOpcode(),
4832                                       RdxDesc.getRecurrenceType(),
4833                                       TargetTransformInfo::ReductionFlags()))
4834           continue;
4835         T = RdxDesc.getRecurrenceType();
4836       }
4837 
4838       // Examine the stored values.
4839       if (auto *ST = dyn_cast<StoreInst>(&I))
4840         T = ST->getValueOperand()->getType();
4841 
4842       assert(T->isSized() &&
4843              "Expected the load/store/recurrence type to be sized");
4844 
4845       ElementTypesInLoop.insert(T);
4846     }
4847   }
4848 }
4849 
4850 unsigned
4851 LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
4852                                                   InstructionCost LoopCost) {
4853   // -- The interleave heuristics --
4854   // We interleave the loop in order to expose ILP and reduce the loop overhead.
4855   // There are many micro-architectural considerations that we can't predict
4856   // at this level. For example, frontend pressure (on decode or fetch) due to
4857   // code size, or the number and capabilities of the execution ports.
4858   //
4859   // We use the following heuristics to select the interleave count:
4860   // 1. If the code has reductions, then we interleave to break the
4861   // cross-iteration dependency.
4862   // 2. If the loop is really small, then we interleave to reduce the loop
4863   // overhead.
4864   // 3. We don't interleave if we think that we will spill registers to memory
4865   // due to the increased register pressure.
4866 
4867   if (!isScalarEpilogueAllowed())
4868     return 1;
4869 
4870   // Do not interleave if EVL is preferred and no User IC is specified.
4871   if (foldTailWithEVL()) {
4872     LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. "
4873                          "Unroll factor forced to be 1.\n");
4874     return 1;
4875   }
4876 
4877   // The maximum safe dependence distance already limits the VF; do not interleave.
4878   if (!Legal->isSafeForAnyVectorWidth())
4879     return 1;
4880 
4881   // We don't attempt to perform interleaving for loops with uncountable early
4882   // exits because the VPInstruction::AnyOf code cannot currently handle
4883   // multiple parts.
4884   if (Legal->hasUncountableEarlyExit())
4885     return 1;
4886 
4887   auto BestKnownTC = getSmallBestKnownTC(PSE, TheLoop);
4888   const bool HasReductions = !Legal->getReductionVars().empty();
4889 
4890   // If we did not calculate the cost for VF (because the user selected the VF)
4891   // then we calculate the cost of VF here.
4892   if (LoopCost == 0) {
4893     LoopCost = expectedCost(VF);
4894     assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost");
4895 
4896     // Loop body is free and there is no need for interleaving.
4897     if (LoopCost == 0)
4898       return 1;
4899   }
4900 
4901   RegisterUsage R = calculateRegisterUsage({VF})[0];
4902   // We divide by these values below, so ensure that every register class has at
4903   // least one instruction using at least one register.
4904   for (auto &Pair : R.MaxLocalUsers) {
4905     Pair.second = std::max(Pair.second, 1U);
4906   }
4907 
4908   // We calculate the interleave count using the following formula.
4909   // Subtract the number of loop invariants from the number of available
4910   // registers. These registers are used by all of the interleaved instances.
4911   // Next, divide the remaining registers by the number of registers that is
4912   // required by the loop, in order to estimate how many parallel instances
4913   // fit without causing spills. All of this is rounded down if necessary to be
4914   // a power of two. We want a power-of-two interleave count to simplify any
4915   // addressing operations or alignment considerations.
4916   // We also want a power-of-two interleave count to ensure that the induction
4917   // variable of the vector loop wraps to zero when the tail is folded by masking;
4918   // this currently happens when OptForSize is set, in which case IC is set to 1 above.
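  // Illustrative numbers only (assumed, not taken from any real target): with
  // 32 registers in a class, 2 loop-invariant values and 5 local users, the
  // computation below gives
  //   bit_floor((32 - 2) / 5) == bit_floor(6) == 4,
  // i.e. up to 4 interleaved copies are assumed to fit without spilling; with
  // EnableIndVarRegisterHeur the induction variable is discounted, giving
  //   bit_floor((32 - 2 - 1) / std::max(1U, 5 - 1)) == bit_floor(7) == 4.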
4919   unsigned IC = UINT_MAX;
4920 
4921   for (const auto &Pair : R.MaxLocalUsers) {
4922     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(Pair.first);
4923     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
4924                       << " registers of "
4925                       << TTI.getRegisterClassName(Pair.first)
4926                       << " register class\n");
4927     if (VF.isScalar()) {
4928       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
4929         TargetNumRegisters = ForceTargetNumScalarRegs;
4930     } else {
4931       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
4932         TargetNumRegisters = ForceTargetNumVectorRegs;
4933     }
4934     unsigned MaxLocalUsers = Pair.second;
4935     unsigned LoopInvariantRegs = 0;
4936     if (R.LoopInvariantRegs.find(Pair.first) != R.LoopInvariantRegs.end())
4937       LoopInvariantRegs = R.LoopInvariantRegs[Pair.first];
4938 
4939     unsigned TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs) /
4940                                      MaxLocalUsers);
4941     // Don't count the induction variable as interleaved.
4942     if (EnableIndVarRegisterHeur) {
4943       TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs - 1) /
4944                               std::max(1U, (MaxLocalUsers - 1)));
4945     }
4946 
4947     IC = std::min(IC, TmpIC);
4948   }
4949 
4950   // Clamp the interleave ranges to reasonable counts.
4951   unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
4952 
4953   // Check if the user has overridden the max.
4954   if (VF.isScalar()) {
4955     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
4956       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
4957   } else {
4958     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
4959       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
4960   }
4961 
4962   unsigned EstimatedVF = getEstimatedRuntimeVF(TheLoop, TTI, VF);
4963   unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4964   if (KnownTC > 0) {
4965     // If a scalar epilogue is required, at least one iteration must remain scalar,
4966     // so the maximum number of iterations available for interleaving is one less.
4967     unsigned AvailableTC =
4968         requiresScalarEpilogue(VF.isVector()) ? KnownTC - 1 : KnownTC;
4969 
4970     // If the trip count is known, we select between two prospective ICs, where
4971     // 1) the aggressive IC is capped by the trip count divided by VF
4972     // 2) the conservative IC is capped by the trip count divided by (VF * 2)
4973     // The final IC is selected in a way that the epilogue loop trip count is
4974     // minimized while maximizing the IC itself, so that we either run the
4975     // vector loop at least once if it generates a small epilogue loop, or else
4976     // we run the vector loop at least twice.
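    // Worked example (hypothetical values, for illustration only): with
    // AvailableTC = 64, EstimatedVF = 8 and MaxInterleaveCount = 8, the
    // aggressive bound is bit_floor(min(64 / 8, 8)) == 8 and the conservative
    // bound is bit_floor(min(64 / 16, 8)) == 4; both leave a scalar tail of
    // 0, so the larger count (8) is chosen. With AvailableTC = 48 the bounds
    // are 4 and 2 with tails of 16 and 0 respectively, so the conservative
    // count (2) is kept.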
4977 
4978     unsigned InterleaveCountUB = bit_floor(
4979         std::max(1u, std::min(AvailableTC / EstimatedVF, MaxInterleaveCount)));
4980     unsigned InterleaveCountLB = bit_floor(std::max(
4981         1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
4982     MaxInterleaveCount = InterleaveCountLB;
4983 
4984     if (InterleaveCountUB != InterleaveCountLB) {
4985       unsigned TailTripCountUB =
4986           (AvailableTC % (EstimatedVF * InterleaveCountUB));
4987       unsigned TailTripCountLB =
4988           (AvailableTC % (EstimatedVF * InterleaveCountLB));
4989       // If both produce the same scalar tail, maximize the IC to do the same work
4990       // in fewer vector loop iterations.
4991       if (TailTripCountUB == TailTripCountLB)
4992         MaxInterleaveCount = InterleaveCountUB;
4993     }
4994   } else if (BestKnownTC && *BestKnownTC > 0) {
4995     // If a scalar epilogue is required, at least one iteration must remain scalar,
4996     // so the maximum number of iterations available for interleaving is one less.
4997     unsigned AvailableTC = requiresScalarEpilogue(VF.isVector())
4998                                ? (*BestKnownTC) - 1
4999                                : *BestKnownTC;
5000 
5001     // If the trip count is only an estimated compile-time constant, cap the
5002     // IC at the trip count divided by (VF * 2), so that the vector loop runs at
5003     // least twice; this makes interleaving seem profitable when there is an
5004     // epilogue loop present. Since the exact trip count is not known, we choose
5005     // to be conservative in our IC estimate.
5006     MaxInterleaveCount = bit_floor(std::max(
5007         1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
5008   }
5009 
5010   assert(MaxInterleaveCount > 0 &&
5011          "Maximum interleave count must be greater than 0");
5012 
5013   // Clamp the calculated IC to be between 1 and the max interleave count
5014   // that the target and trip count allow.
5015   if (IC > MaxInterleaveCount)
5016     IC = MaxInterleaveCount;
5017   else
5018     // Make sure IC is greater than 0.
5019     IC = std::max(1u, IC);
5020 
5021   assert(IC > 0 && "Interleave count must be greater than 0.");
5022 
5023   // Interleave if we vectorized this loop and there is a reduction that could
5024   // benefit from interleaving.
5025   if (VF.isVector() && HasReductions) {
5026     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5027     return IC;
5028   }
5029 
5030   // For any scalar loop that either requires runtime checks or predication we
5031   // are better off leaving this to the unroller. Note that if we've already
5032   // vectorized the loop we will have done the runtime check and so interleaving
5033   // won't require further checks.
5034   bool ScalarInterleavingRequiresPredication =
5035       (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) {
5036          return Legal->blockNeedsPredication(BB);
5037        }));
5038   bool ScalarInterleavingRequiresRuntimePointerCheck =
5039       (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
5040 
5041   // We want to interleave small loops in order to reduce the loop overhead and
5042   // potentially expose ILP opportunities.
5043   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
5044                     << "LV: IC is " << IC << '\n'
5045                     << "LV: VF is " << VF << '\n');
5046   const bool AggressivelyInterleaveReductions =
5047       TTI.enableAggressiveInterleaving(HasReductions);
5048   if (!ScalarInterleavingRequiresRuntimePointerCheck &&
5049       !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
5050     // We assume that the cost overhead is 1 and we use the cost model
5051     // to estimate the cost of the loop and interleave until the cost of the
5052     // loop overhead is about 5% of the cost of the loop.
5053     unsigned SmallIC = std::min(IC, (unsigned)llvm::bit_floor<uint64_t>(
5054                                         SmallLoopCost / *LoopCost.getValue()));
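    // For example (illustrative values; SmallLoopCost is a threshold defined
    // earlier in this file): with SmallLoopCost == 20 and a loop cost of 6,
    // SmallIC == min(IC, bit_floor(20 / 6)) == min(IC, 2), which keeps the
    // per-iteration loop overhead a small fraction of the total cost.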
5055 
5056     // Interleave until store/load ports (estimated by max interleave count) are
5057     // saturated.
5058     unsigned NumStores = Legal->getNumStores();
5059     unsigned NumLoads = Legal->getNumLoads();
5060     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5061     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5062 
5063     // There is little point in interleaving for reductions containing selects
5064     // and compares when VF=1 since it may just create more overhead than it's
5065     // worth for loops with small trip counts. This is because we still have to
5066     // do the final reduction after the loop.
5067     bool HasSelectCmpReductions =
5068         HasReductions &&
5069         any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5070           const RecurrenceDescriptor &RdxDesc = Reduction.second;
5071           RecurKind RK = RdxDesc.getRecurrenceKind();
5072           return RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) ||
5073                  RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK);
5074         });
5075     if (HasSelectCmpReductions) {
5076       LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
5077       return 1;
5078     }
5079 
5080     // If we have a scalar reduction (vector reductions are already dealt with
5081     // by this point), we can increase the critical path length if the loop
5082     // we're interleaving is inside another loop. For tree-wise reductions
5083     // set the limit to 2, and for ordered reductions it's best to disable
5084     // interleaving entirely.
5085     if (HasReductions && TheLoop->getLoopDepth() > 1) {
5086       bool HasOrderedReductions =
5087           any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5088             const RecurrenceDescriptor &RdxDesc = Reduction.second;
5089             return RdxDesc.isOrdered();
5090           });
5091       if (HasOrderedReductions) {
5092         LLVM_DEBUG(
5093             dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
5094         return 1;
5095       }
5096 
5097       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5098       SmallIC = std::min(SmallIC, F);
5099       StoresIC = std::min(StoresIC, F);
5100       LoadsIC = std::min(LoadsIC, F);
5101     }
5102 
5103     if (EnableLoadStoreRuntimeInterleave &&
5104         std::max(StoresIC, LoadsIC) > SmallIC) {
5105       LLVM_DEBUG(
5106           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5107       return std::max(StoresIC, LoadsIC);
5108     }
5109 
5110     // If there are scalar reductions and TTI has enabled aggressive
5111     // interleaving for reductions, we will interleave to expose ILP.
5112     if (VF.isScalar() && AggressivelyInterleaveReductions) {
5113       LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5114       // Interleave no less than SmallIC but not as aggressively as the normal IC,
5115       // to handle the rare case where resources are too limited.
5116       return std::max(IC / 2, SmallIC);
5117     }
5118 
5119     LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5120     return SmallIC;
5121   }
5122 
5123   // Interleave if this is a large loop (small loops are already dealt with by
5124   // this point) that could benefit from interleaving.
5125   if (AggressivelyInterleaveReductions) {
5126     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5127     return IC;
5128   }
5129 
5130   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5131   return 1;
5132 }
5133 
5134 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5135 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
5136   // This function calculates the register usage by measuring the highest number
5137   // of values that are alive at a single location. Obviously, this is a very
5138   // rough estimation. We scan the loop in topological order and
5139   // assign a number to each instruction. We use RPO to ensure that defs are
5140   // met before their users. We assume that each instruction that has in-loop
5141   // users starts an interval. We record every time that an in-loop value is
5142   // used, so we have a list of the first and last occurrences of each
5143   // instruction. Next, we transpose this data structure into a multi map that
5144   // holds the list of intervals that *end* at a specific location. This multi
5145   // map allows us to perform a linear search. We scan the instructions linearly
5146   // and record each time that a new interval starts, by placing it in a set.
5147   // If we find this value in the multi-map then we remove it from the set.
5148   // The max register usage is the maximum size of the set.
5149   // We also search for instructions that are defined outside the loop, but are
5150   // used inside the loop. We need this number separately from the max-interval
5151   // usage number because when we unroll, loop-invariant values do not take
5152   // more registers.
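  // As a rough illustration (hypothetical straight-line body, assumed for
  // exposition), for
  //   %a = load ...; %b = add %a, 1; %c = mul %a, %b; store %c, ...
  // the interval for %a spans from its definition to its last use in %c while
  // %b is live across a single instruction, so at most two in-loop values are
  // live at once; if the load's pointer is computed by an instruction outside
  // the loop, that instruction is recorded in LoopInvariants instead.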
5153   LoopBlocksDFS DFS(TheLoop);
5154   DFS.perform(LI);
5155 
5156   RegisterUsage RU;
5157 
5158   // Each 'key' in the map opens a new interval. The values
5159   // of the map are the index of the 'last seen' usage of the
5160   // instruction that is the key.
5161   using IntervalMap = SmallDenseMap<Instruction *, unsigned, 16>;
5162 
5163   // Maps instruction to its index.
5164   SmallVector<Instruction *, 64> IdxToInstr;
5165   // Marks the end of each interval.
5166   IntervalMap EndPoint;
5167   // Saves the set of instructions whose values are used within the loop.
5168   SmallPtrSet<Instruction *, 8> Ends;
5169   // Saves the list of values that are used in the loop but are defined outside
5170   // the loop (not including non-instruction values such as arguments and
5171   // constants).
5172   SmallSetVector<Instruction *, 8> LoopInvariants;
5173 
5174   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5175     for (Instruction &I : BB->instructionsWithoutDebug()) {
5176       IdxToInstr.push_back(&I);
5177 
5178       // Save the end location of each USE.
5179       for (Value *U : I.operands()) {
5180         auto *Instr = dyn_cast<Instruction>(U);
5181 
5182         // Ignore non-instruction values such as arguments, constants, etc.
5183         // FIXME: Might need some motivation why these values are ignored. If,
5184         // for example, an argument is used inside the loop, it will increase the
5185         // register pressure (so shouldn't we add it to LoopInvariants?).
5186         if (!Instr)
5187           continue;
5188 
5189         // If this instruction is outside the loop then record it and continue.
5190         if (!TheLoop->contains(Instr)) {
5191           LoopInvariants.insert(Instr);
5192           continue;
5193         }
5194 
5195         // Overwrite previous end points.
5196         EndPoint[Instr] = IdxToInstr.size();
5197         Ends.insert(Instr);
5198       }
5199     }
5200   }
5201 
5202   // Saves the list of intervals that end with the index in 'key'.
5203   using InstrList = SmallVector<Instruction *, 2>;
5204   SmallDenseMap<unsigned, InstrList, 16> TransposeEnds;
5205 
5206   // Transpose the EndPoints to a list of values that end at each index.
5207   for (auto &Interval : EndPoint)
5208     TransposeEnds[Interval.second].push_back(Interval.first);
5209 
5210   SmallPtrSet<Instruction *, 8> OpenIntervals;
5211   SmallVector<RegisterUsage, 8> RUs(VFs.size());
5212   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
5213 
5214   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5215 
5216   const auto &TTICapture = TTI;
5217   auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
5218     if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty) ||
5219         (VF.isScalable() &&
5220          !TTICapture.isElementTypeLegalForScalableVector(Ty)))
5221       return 0;
5222     return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
5223   };
5224 
5225   for (unsigned int Idx = 0, Sz = IdxToInstr.size(); Idx < Sz; ++Idx) {
5226     Instruction *I = IdxToInstr[Idx];
5227 
5228     // Remove all of the instructions that end at this location.
5229     InstrList &List = TransposeEnds[Idx];
5230     for (Instruction *ToRemove : List)
5231       OpenIntervals.erase(ToRemove);
5232 
5233     // Ignore instructions that are never used within the loop.
5234     if (!Ends.count(I))
5235       continue;
5236 
5237     // Skip ignored values.
5238     if (ValuesToIgnore.count(I))
5239       continue;
5240 
5241     collectInLoopReductions();
5242 
5243     // For each VF find the maximum usage of registers.
5244     for (unsigned J = 0, E = VFs.size(); J < E; ++J) {
5245       // Count the number of registers used, per register class, given all open
5246       // intervals.
5247       // Note that elements in this SmallMapVector will be default constructed
5248       // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if
5249       // there is no previous entry for ClassID.
5250       SmallMapVector<unsigned, unsigned, 4> RegUsage;
5251 
5252       if (VFs[J].isScalar()) {
5253         for (auto *Inst : OpenIntervals) {
5254           unsigned ClassID =
5255               TTI.getRegisterClassForType(false, Inst->getType());
5256           // FIXME: The target might use more than one register for the type
5257           // even in the scalar case.
5258           RegUsage[ClassID] += 1;
5259         }
5260       } else {
5261         collectUniformsAndScalars(VFs[J]);
5262         for (auto *Inst : OpenIntervals) {
5263           // Skip ignored values for VF > 1.
5264           if (VecValuesToIgnore.count(Inst))
5265             continue;
5266           if (isScalarAfterVectorization(Inst, VFs[J])) {
5267             unsigned ClassID =
5268                 TTI.getRegisterClassForType(false, Inst->getType());
5269             // FIXME: The target might use more than one register for the type
5270             // even in the scalar case.
5271             RegUsage[ClassID] += 1;
5272           } else {
5273             unsigned ClassID =
5274                 TTI.getRegisterClassForType(true, Inst->getType());
5275             RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[J]);
5276           }
5277         }
5278       }
5279 
5280       for (const auto &Pair : RegUsage) {
5281         auto &Entry = MaxUsages[J][Pair.first];
5282         Entry = std::max(Entry, Pair.second);
5283       }
5284     }
5285 
5286     LLVM_DEBUG(dbgs() << "LV(REG): At #" << Idx << " Interval # "
5287                       << OpenIntervals.size() << '\n');
5288 
5289     // Add the current instruction to the list of open intervals.
5290     OpenIntervals.insert(I);
5291   }
5292 
5293   for (unsigned Idx = 0, End = VFs.size(); Idx < End; ++Idx) {
5294     // Note that elements in this SmallMapVector will be default constructed
5295     // as 0. So we can use "Invariant[ClassID] += n" in the code below even if
5296     // there is no previous entry for ClassID.
5297     SmallMapVector<unsigned, unsigned, 4> Invariant;
5298 
5299     for (auto *Inst : LoopInvariants) {
5300       // FIXME: The target might use more than one register for the type
5301       // even in the scalar case.
5302       bool IsScalar = all_of(Inst->users(), [&](User *U) {
5303         auto *I = cast<Instruction>(U);
5304         return TheLoop != LI->getLoopFor(I->getParent()) ||
5305                isScalarAfterVectorization(I, VFs[Idx]);
5306       });
5307 
5308       ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[Idx];
5309       unsigned ClassID =
5310           TTI.getRegisterClassForType(VF.isVector(), Inst->getType());
5311       Invariant[ClassID] += GetRegUsage(Inst->getType(), VF);
5312     }
5313 
5314     LLVM_DEBUG({
5315       dbgs() << "LV(REG): VF = " << VFs[Idx] << '\n';
5316       dbgs() << "LV(REG): Found max usage: " << MaxUsages[Idx].size()
5317              << " item\n";
5318       for (const auto &pair : MaxUsages[Idx]) {
5319         dbgs() << "LV(REG): RegisterClass: "
5320                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5321                << " registers\n";
5322       }
5323       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
5324              << " item\n";
5325       for (const auto &pair : Invariant) {
5326         dbgs() << "LV(REG): RegisterClass: "
5327                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5328                << " registers\n";
5329       }
5330     });
5331 
5332     RU.LoopInvariantRegs = Invariant;
5333     RU.MaxLocalUsers = MaxUsages[Idx];
5334     RUs[Idx] = RU;
5335   }
5336 
5337   return RUs;
5338 }
5339 
5340 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
5341                                                            ElementCount VF) {
5342   // TODO: Cost model for emulated masked load/store is completely
5343   // broken. This hack guides the cost model to use an artificially
5344   // high enough value to practically disable vectorization with such
5345   // operations, except where the previously deployed legality hack allowed
5346   // using very low cost values. This is to avoid regressions coming simply
5347   // from moving the "masked load/store" check from legality to the cost model.
5348   // Masked load/gather emulation was previously never allowed.
5349   // Only a limited amount of masked store/scatter emulation was allowed.
5350   assert((isPredicatedInst(I)) &&
5351          "Expecting a scalar emulated instruction");
5352   return isa<LoadInst>(I) ||
5353          (isa<StoreInst>(I) &&
5354           NumPredStores > NumberOfStoresToPredicate);
5355 }
5356 
5357 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
5358   // If we aren't vectorizing the loop, or if we've already collected the
5359   // instructions to scalarize, there's nothing to do. Collection may already
5360   // have occurred if we have a user-selected VF and are now computing the
5361   // expected cost for interleaving.
5362   if (VF.isScalar() || VF.isZero() || InstsToScalarize.contains(VF))
5363     return;
5364 
5365   // Initialize a mapping for VF in InstsToScalarize. If we find that it's
5366   // not profitable to scalarize any instructions, the presence of VF in the
5367   // map will indicate that we've analyzed it already.
5368   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5369 
5370   PredicatedBBsAfterVectorization[VF].clear();
5371 
5372   // Find all the instructions that are scalar with predication in the loop and
5373   // determine if it would be better to not if-convert the blocks they are in.
5374   // If so, we also record the instructions to scalarize.
5375   for (BasicBlock *BB : TheLoop->blocks()) {
5376     if (!blockNeedsPredicationForAnyReason(BB))
5377       continue;
5378     for (Instruction &I : *BB)
5379       if (isScalarWithPredication(&I, VF)) {
5380         ScalarCostsTy ScalarCosts;
5381         // Do not apply discount logic for:
5382         // 1. Scalars after vectorization, as there will only be a single copy
5383         // of the instruction.
5384         // 2. Scalable VF, as that would lead to invalid scalarization costs.
5385         // 3. Emulated masked memrefs, if a hacked cost is needed.
5386         if (!isScalarAfterVectorization(&I, VF) && !VF.isScalable() &&
5387             !useEmulatedMaskMemRefHack(&I, VF) &&
5388             computePredInstDiscount(&I, ScalarCosts, VF) >= 0) {
5389           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5390           // Check if we decided to scalarize a call. If so, update the widening
5391           // decision of the call to CM_Scalarize with the computed scalar cost.
5392           for (const auto &[I, _] : ScalarCosts) {
5393             auto *CI = dyn_cast<CallInst>(I);
5394             if (!CI || !CallWideningDecisions.contains({CI, VF}))
5395               continue;
5396             CallWideningDecisions[{CI, VF}].Kind = CM_Scalarize;
5397             CallWideningDecisions[{CI, VF}].Cost = ScalarCosts[CI];
5398           }
5399         }
5400         // Remember that BB will remain after vectorization.
5401         PredicatedBBsAfterVectorization[VF].insert(BB);
5402         for (auto *Pred : predecessors(BB)) {
5403           if (Pred->getSingleSuccessor() == BB)
5404             PredicatedBBsAfterVectorization[VF].insert(Pred);
5405         }
5406       }
5407   }
5408 }
5409 
5410 InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
5411     Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
5412   assert(!isUniformAfterVectorization(PredInst, VF) &&
5413          "Instruction marked uniform-after-vectorization will be predicated");
5414 
5415   // Initialize the discount to zero, meaning that the scalar version and the
5416   // vector version cost the same.
5417   InstructionCost Discount = 0;
5418 
5419   // Holds instructions to analyze. The instructions we visit are mapped in
5420   // ScalarCosts. Those instructions are the ones that would be scalarized if
5421   // we find that the scalar version costs less.
5422   SmallVector<Instruction *, 8> Worklist;
5423 
5424   // Returns true if the given instruction can be scalarized.
5425   auto CanBeScalarized = [&](Instruction *I) -> bool {
5426     // We only attempt to scalarize instructions forming a single-use chain
5427     // from the original predicated block that would otherwise be vectorized.
5428     // Although not strictly necessary, we give up on instructions we know will
5429     // already be scalar to avoid traversing chains that are unlikely to be
5430     // beneficial.
5431     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5432         isScalarAfterVectorization(I, VF))
5433       return false;
5434 
5435     // If the instruction is scalar with predication, it will be analyzed
5436     // separately. We ignore it within the context of PredInst.
5437     if (isScalarWithPredication(I, VF))
5438       return false;
5439 
5440     // If any of the instruction's operands are uniform after vectorization,
5441     // the instruction cannot be scalarized. This prevents, for example, a
5442     // masked load from being scalarized.
5443     //
5444     // We assume we will only emit a value for lane zero of an instruction
5445     // marked uniform after vectorization, rather than VF identical values.
5446     // Thus, if we scalarize an instruction that uses a uniform, we would
5447     // create uses of values corresponding to the lanes we aren't emitting code
5448     // for. This behavior can be changed by allowing getScalarValue to clone
5449     // the lane zero values for uniforms rather than asserting.
5450     for (Use &U : I->operands())
5451       if (auto *J = dyn_cast<Instruction>(U.get()))
5452         if (isUniformAfterVectorization(J, VF))
5453           return false;
5454 
5455     // Otherwise, we can scalarize the instruction.
5456     return true;
5457   };
5458 
5459   // Compute the expected cost discount from scalarizing the entire expression
5460   // feeding the predicated instruction. We currently only consider expressions
5461   // that are single-use instruction chains.
5462   Worklist.push_back(PredInst);
5463   while (!Worklist.empty()) {
5464     Instruction *I = Worklist.pop_back_val();
5465 
5466     // If we've already analyzed the instruction, there's nothing to do.
5467     if (ScalarCosts.contains(I))
5468       continue;
5469 
5470     // Compute the cost of the vector instruction. Note that this cost already
5471     // includes the scalarization overhead of the predicated instruction.
5472     InstructionCost VectorCost = getInstructionCost(I, VF);
5473 
5474     // Compute the cost of the scalarized instruction. This cost is the cost of
5475     // the instruction as if it wasn't if-converted and instead remained in the
5476     // predicated block. We will scale this cost by block probability after
5477     // computing the scalarization overhead.
5478     InstructionCost ScalarCost =
5479         VF.getFixedValue() * getInstructionCost(I, ElementCount::getFixed(1));
5480 
5481     // Compute the scalarization overhead of needed insertelement instructions
5482     // and phi nodes.
5483     if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
5484       ScalarCost += TTI.getScalarizationOverhead(
5485           cast<VectorType>(toVectorTy(I->getType(), VF)),
5486           APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ true,
5487           /*Extract*/ false, CostKind);
5488       ScalarCost +=
5489           VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind);
5490     }
5491 
5492     // Compute the scalarization overhead of needed extractelement
5493     // instructions. For each of the instruction's operands, if the operand can
5494     // be scalarized, add it to the worklist; otherwise, account for the
5495     // overhead.
5496     for (Use &U : I->operands())
5497       if (auto *J = dyn_cast<Instruction>(U.get())) {
5498         assert(VectorType::isValidElementType(J->getType()) &&
5499                "Instruction has non-scalar type");
5500         if (CanBeScalarized(J))
5501           Worklist.push_back(J);
5502         else if (needsExtract(J, VF)) {
5503           ScalarCost += TTI.getScalarizationOverhead(
5504               cast<VectorType>(toVectorTy(J->getType(), VF)),
5505               APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false,
5506               /*Extract*/ true, CostKind);
5507         }
5508       }
5509 
5510     // Scale the total scalar cost by block probability.
5511     ScalarCost /= getReciprocalPredBlockProb();
5512 
5513     // Compute the discount. A non-negative discount means the vector version
5514     // of the instruction costs more, and scalarizing would be beneficial.
5515     Discount += VectorCost - ScalarCost;
5516     ScalarCosts[I] = ScalarCost;
5517   }
5518 
5519   return Discount;
5520 }
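// Worked example with hypothetical costs (illustration only, not real TTI
// output): for VF = 4, suppose the vector form of PredInst costs 12, its
// scalar form costs 2 per lane, and the insertelement/phi overhead adds 4.
// Then ScalarCost = (4 * 2 + 4) / getReciprocalPredBlockProb(); if the
// predicated block is modelled as executing half the time, that is 12 / 2 = 6,
// so the discount is 12 - 6 = 6 >= 0 and scalarizing the chain is considered
// worthwhile.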
5521 
5522 InstructionCost LoopVectorizationCostModel::expectedCost(ElementCount VF) {
5523   InstructionCost Cost;
5524 
5525   // If the vector loop gets executed exactly once with the given VF, ignore the
5526   // costs of comparison and induction instructions, as they'll get simplified
5527   // away.
5528   SmallPtrSet<Instruction *, 2> ValuesToIgnoreForVF;
5529   auto TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5530   if (VF.isFixed() && TC == VF.getFixedValue() && !foldTailByMasking())
5531     addFullyUnrolledInstructionsToIgnore(TheLoop, Legal->getInductionVars(),
5532                                          ValuesToIgnoreForVF);
5533 
5534   // For each block.
5535   for (BasicBlock *BB : TheLoop->blocks()) {
5536     InstructionCost BlockCost;
5537 
5538     // For each instruction in the old loop.
5539     for (Instruction &I : BB->instructionsWithoutDebug()) {
5540       // Skip ignored values.
5541       if (ValuesToIgnore.count(&I) || ValuesToIgnoreForVF.count(&I) ||
5542           (VF.isVector() && VecValuesToIgnore.count(&I)))
5543         continue;
5544 
5545       InstructionCost C = getInstructionCost(&I, VF);
5546 
5547       // Check if we should override the cost.
5548       if (C.isValid() && ForceTargetInstructionCost.getNumOccurrences() > 0)
5549         C = InstructionCost(ForceTargetInstructionCost);
5550 
5551       BlockCost += C;
5552       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF "
5553                         << VF << " For instruction: " << I << '\n');
5554     }
5555 
5556     // If we are vectorizing a predicated block, it will have been
5557     // if-converted. This means that the block's instructions (aside from
5558     // stores and instructions that may divide by zero) will now be
5559     // unconditionally executed. For the scalar case, we may not always execute
5560     // the predicated block, if it is an if-else block. Thus, scale the block's
5561     // cost by the probability of executing it. blockNeedsPredication from
5562     // Legal is used so as to not include all blocks in tail folded loops.
5563     if (VF.isScalar() && Legal->blockNeedsPredication(BB))
5564       BlockCost /= getReciprocalPredBlockProb();
5565 
5566     Cost += BlockCost;
5567   }
5568 
5569   return Cost;
5570 }
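// Worked example with hypothetical block costs (illustration only): for a
// scalar VF and a loop with an unconditional block costing 8 and a predicated
// block costing 4, the predicated block's cost is scaled by the reciprocal
// block probability; if that is modelled as 2 (a 50% chance of execution),
// the expected loop cost is 8 + 4 / 2 = 10.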
5571 
5572 /// Gets the address access SCEV after verifying that the access pattern is
5573 /// loop invariant except for the induction variable dependence.
5574 ///
5575 /// This SCEV can be sent to the Target in order to estimate the address
5576 /// calculation cost.
5577 static const SCEV *getAddressAccessSCEV(
5578               Value *Ptr,
5579               LoopVectorizationLegality *Legal,
5580               PredicatedScalarEvolution &PSE,
5581               const Loop *TheLoop) {
5582 
5583   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
5584   if (!Gep)
5585     return nullptr;
5586 
5587   // We are looking for a gep with all loop invariant indices except for one
5588   // which should be an induction variable.
5589   auto *SE = PSE.getSE();
5590   unsigned NumOperands = Gep->getNumOperands();
5591   for (unsigned Idx = 1; Idx < NumOperands; ++Idx) {
5592     Value *Opd = Gep->getOperand(Idx);
5593     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
5594         !Legal->isInductionVariable(Opd))
5595       return nullptr;
5596   }
5597 
5598   // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
5599   return PSE.getSCEV(Ptr);
5600 }
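// For example (illustrative IR, not from any specific test): a GEP such as
//   getelementptr inbounds i32, ptr %base, i64 %iv
// with %iv an induction variable and %base defined outside the loop qualifies,
// and its SCEV is returned for address-cost estimation; a GEP whose index is a
// loop-varying value that is not an induction variable yields nullptr.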
5601 
5602 InstructionCost
5603 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
5604                                                         ElementCount VF) {
5605   assert(VF.isVector() &&
5606          "Scalarization cost of instruction implies vectorization.");
5607   if (VF.isScalable())
5608     return InstructionCost::getInvalid();
5609 
5610   Type *ValTy = getLoadStoreType(I);
5611   auto *SE = PSE.getSE();
5612 
5613   unsigned AS = getLoadStoreAddressSpace(I);
5614   Value *Ptr = getLoadStorePointerOperand(I);
5615   Type *PtrTy = toVectorTy(Ptr->getType(), VF);
5616   // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
5617   //       that it is being called from this specific place.
5618 
5619   // Figure out whether the access is strided and get the stride value
5620   // if it's known at compile time.
5621   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
5622 
5623   // Get the cost of the scalar memory instruction and address computation.
5624   InstructionCost Cost =
5625       VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
5626 
5627   // Don't pass *I here, since it is scalar but will actually be part of a
5628   // vectorized loop where the user of it is a vectorized instruction.
5629   const Align Alignment = getLoadStoreAlignment(I);
5630   Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(),
5631                                                       ValTy->getScalarType(),
5632                                                       Alignment, AS, CostKind);
5633 
5634   // Get the overhead of the extractelement and insertelement instructions
5635   // we might create due to scalarization.
5636   Cost += getScalarizationOverhead(I, VF);
5637 
5638   // If we have a predicated load/store, it will need extra i1 extracts and
5639   // conditional branches, but may not be executed for each vector lane. Scale
5640   // the cost by the probability of executing the predicated block.
5641   if (isPredicatedInst(I)) {
5642     Cost /= getReciprocalPredBlockProb();
5643 
5644     // Add the cost of an i1 extract and a branch
5645     auto *VecI1Ty =
5646         VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
5647     Cost += TTI.getScalarizationOverhead(
5648         VecI1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
5649         /*Insert=*/false, /*Extract=*/true, CostKind);
5650     Cost += TTI.getCFInstrCost(Instruction::Br, CostKind);
5651 
5652     if (useEmulatedMaskMemRefHack(I, VF))
5653       // Artificially setting to a high enough value to practically disable
5654       // vectorization with such operations.
5655       Cost = 3000000;
5656   }
5657 
5658   return Cost;
5659 }
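// Sketch of the accounting above with hypothetical TTI numbers (illustration
// only): for VF = 4, an address-computation cost of 1 and a scalar load cost
// of 2 give 4 * (1 + 2) = 12, plus the insert/extract scalarization overhead.
// For a predicated access the sum is divided by the reciprocal block
// probability, the i1 extract and branch costs are added, and the
// emulated-mask hack can finally pin the cost at 3000000.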
5660 
5661 InstructionCost
5662 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
5663                                                     ElementCount VF) {
5664   Type *ValTy = getLoadStoreType(I);
5665   auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5666   Value *Ptr = getLoadStorePointerOperand(I);
5667   unsigned AS = getLoadStoreAddressSpace(I);
5668   int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
5669 
5670   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5671          "Stride should be 1 or -1 for consecutive memory access");
5672   const Align Alignment = getLoadStoreAlignment(I);
5673   InstructionCost Cost = 0;
5674   if (Legal->isMaskRequired(I)) {
5675     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
5676                                       CostKind);
5677   } else {
5678     TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
5679     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
5680                                 CostKind, OpInfo, I);
5681   }
5682 
5683   bool Reverse = ConsecutiveStride < 0;
5684   if (Reverse)
5685     Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, {},
5686                                CostKind, 0);
5687   return Cost;
5688 }
5689 
5690 InstructionCost
5691 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
5692                                                 ElementCount VF) {
5693   assert(Legal->isUniformMemOp(*I, VF));
5694 
5695   Type *ValTy = getLoadStoreType(I);
5696   auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5697   const Align Alignment = getLoadStoreAlignment(I);
5698   unsigned AS = getLoadStoreAddressSpace(I);
5699   if (isa<LoadInst>(I)) {
5700     return TTI.getAddressComputationCost(ValTy) +
5701            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
5702                                CostKind) +
5703            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy, {},
5704                               CostKind);
5705   }
5706   StoreInst *SI = cast<StoreInst>(I);
5707 
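  // With an invariant (uniform) address, only the value stored by the final
  // vector lane is observable after the loop, so the cost below includes a
  // single extract of element VF - 1, and no extract at all when the stored
  // value is itself loop invariant.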
5708   bool IsLoopInvariantStoreValue = Legal->isInvariant(SI->getValueOperand());
5709   return TTI.getAddressComputationCost(ValTy) +
5710          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
5711                              CostKind) +
5712          (IsLoopInvariantStoreValue
5713               ? 0
5714               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
5715                                        CostKind, VF.getKnownMinValue() - 1));
5716 }
5717 
5718 InstructionCost
5719 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
5720                                                  ElementCount VF) {
5721   Type *ValTy = getLoadStoreType(I);
5722   auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5723   const Align Alignment = getLoadStoreAlignment(I);
5724   const Value *Ptr = getLoadStorePointerOperand(I);
5725 
5726   return TTI.getAddressComputationCost(VectorTy) +
5727          TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
5728                                     Legal->isMaskRequired(I), Alignment,
5729                                     CostKind, I);
5730 }
5731 
5732 InstructionCost
5733 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
5734                                                    ElementCount VF) {
5735   const auto *Group = getInterleavedAccessGroup(I);
5736   assert(Group && "Failed to get an interleaved access group.");
5737 
5738   Instruction *InsertPos = Group->getInsertPos();
5739   Type *ValTy = getLoadStoreType(InsertPos);
5740   auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5741   unsigned AS = getLoadStoreAddressSpace(InsertPos);
5742 
5743   unsigned InterleaveFactor = Group->getFactor();
5744   auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
5745 
5746   // Holds the indices of existing members in the interleaved group.
5747   SmallVector<unsigned, 4> Indices;
5748   for (unsigned IF = 0; IF < InterleaveFactor; IF++)
5749     if (Group->getMember(IF))
5750       Indices.push_back(IF);
5751 
5752   // Calculate the cost of the whole interleaved group.
5753   bool UseMaskForGaps =
5754       (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
5755       (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
5756   InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
5757       InsertPos->getOpcode(), WideVecTy, Group->getFactor(), Indices,
5758       Group->getAlign(), AS, CostKind, Legal->isMaskRequired(I),
5759       UseMaskForGaps);
5760 
5761   if (Group->isReverse()) {
5762     // TODO: Add support for reversed masked interleaved access.
5763     assert(!Legal->isMaskRequired(I) &&
5764            "Reverse masked interleaved access not supported.");
5765     Cost += Group->getNumMembers() *
5766             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, {},
5767                                CostKind, 0);
5768   }
5769   return Cost;
5770 }
5771 
5772 std::optional<InstructionCost>
5773 LoopVectorizationCostModel::getReductionPatternCost(Instruction *I,
5774                                                     ElementCount VF,
5775                                                     Type *Ty) const {
5776   using namespace llvm::PatternMatch;
5777   // Early exit if there are no in-loop reductions.
5778   if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Ty))
5779     return std::nullopt;
5780   auto *VectorTy = cast<VectorType>(Ty);
5781 
5782   // We are looking for one of the following patterns, and its minimal acceptable cost:
5783   //  reduce(mul(ext(A), ext(B))) or
5784   //  reduce(mul(A, B)) or
5785   //  reduce(ext(A)) or
5786   //  reduce(A).
5787   // The basic idea is that we walk down the tree to do that, finding the root
5788   // reduction instruction in InLoopReductionImmediateChains. From there we find
5789   // the pattern of mul/ext and test the cost of the entire pattern vs the cost
5790   // of the components. If the reduction cost is lower, then we return it for the
5791   // reduction instruction and 0 for the other instructions in the pattern. If
5792   // it is not, we return an invalid cost specifying that the original cost method
5793   // should be used.
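  // For instance (hypothetical IR, for illustration): given
  //   %e = sext i8 %a to i32 ; %m = mul i32 %e, %f ; %r = add i32 %m, %phi
  // we step from %e to its single user %m and, because %m's only user is an
  // add, on to %r; if %r is registered in InLoopReductionImmediateChains, the
  // whole ext/mul/add pattern is costed as one unit below.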
5794   Instruction *RetI = I;
5795   if (match(RetI, m_ZExtOrSExt(m_Value()))) {
5796     if (!RetI->hasOneUser())
5797       return std::nullopt;
5798     RetI = RetI->user_back();
5799   }
5800 
5801   if (match(RetI, m_OneUse(m_Mul(m_Value(), m_Value()))) &&
5802       RetI->user_back()->getOpcode() == Instruction::Add) {
5803     RetI = RetI->user_back();
5804   }
5805 
5806   // Test if the found instruction is a reduction, and if not return an invalid
5807   // cost specifying the parent to use the original cost modelling.
5808   if (!InLoopReductionImmediateChains.count(RetI))
5809     return std::nullopt;
5810 
5811   // Find the reduction this chain is a part of and calculate the basic cost of
5812   // the reduction on its own.
5813   Instruction *LastChain = InLoopReductionImmediateChains.at(RetI);
5814   Instruction *ReductionPhi = LastChain;
5815   while (!isa<PHINode>(ReductionPhi))
5816     ReductionPhi = InLoopReductionImmediateChains.at(ReductionPhi);
5817 
5818   const RecurrenceDescriptor &RdxDesc =
5819       Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second;
5820 
5821   InstructionCost BaseCost;
5822   RecurKind RK = RdxDesc.getRecurrenceKind();
5823   if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) {
5824     Intrinsic::ID MinMaxID = getMinMaxReductionIntrinsicOp(RK);
5825     BaseCost = TTI.getMinMaxReductionCost(MinMaxID, VectorTy,
5826                                           RdxDesc.getFastMathFlags(), CostKind);
5827   } else {
5828     BaseCost = TTI.getArithmeticReductionCost(
5829         RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
5830   }
5831 
5832   // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
5833   // normal fmul instruction to the cost of the fadd reduction.
5834   if (RK == RecurKind::FMulAdd)
5835     BaseCost +=
5836         TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);
5837 
5838   // If we're using ordered reductions then we can just return the base cost
5839   // here, since getArithmeticReductionCost calculates the full ordered
5840   // reduction cost when FP reassociation is not allowed.
5841   if (useOrderedReductions(RdxDesc))
5842     return BaseCost;
5843 
5844   // Get the operand that was not the reduction chain and match it to one of the
5845   // patterns, returning the better cost if it is found.
5846   Instruction *RedOp = RetI->getOperand(1) == LastChain
5847                            ? dyn_cast<Instruction>(RetI->getOperand(0))
5848                            : dyn_cast<Instruction>(RetI->getOperand(1));
5849 
5850   VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
5851 
5852   Instruction *Op0, *Op1;
5853   if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
5854       match(RedOp,
5855             m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
5856       match(Op0, m_ZExtOrSExt(m_Value())) &&
5857       Op0->getOpcode() == Op1->getOpcode() &&
5858       Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
5859       !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) &&
5860       (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
5861 
5862     // Matched reduce.add(ext(mul(ext(A), ext(B))))
5863     // Note that the extend opcodes need to all match, or if A==B they will have
5864     // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
5865     // which is equally fine.
5866     bool IsUnsigned = isa<ZExtInst>(Op0);
5867     auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
5868     auto *MulType = VectorType::get(Op0->getType(), VectorTy);
5869 
5870     InstructionCost ExtCost =
5871         TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
5872                              TTI::CastContextHint::None, CostKind, Op0);
5873     InstructionCost MulCost =
5874         TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
5875     InstructionCost Ext2Cost =
5876         TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType,
5877                              TTI::CastContextHint::None, CostKind, RedOp);
5878 
5879     InstructionCost RedCost = TTI.getMulAccReductionCost(
5880         IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
5881 
5882     if (RedCost.isValid() &&
5883         RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
5884       return I == RetI ? RedCost : 0;
5885   } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
5886              !TheLoop->isLoopInvariant(RedOp)) {
5887     // Matched reduce(ext(A))
5888     bool IsUnsigned = isa<ZExtInst>(RedOp);
5889     auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
5890     InstructionCost RedCost = TTI.getExtendedReductionCost(
5891         RdxDesc.getOpcode(), IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
5892         RdxDesc.getFastMathFlags(), CostKind);
5893 
5894     InstructionCost ExtCost =
5895         TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
5896                              TTI::CastContextHint::None, CostKind, RedOp);
5897     if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
5898       return I == RetI ? RedCost : 0;
5899   } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
5900              match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
5901     if (match(Op0, m_ZExtOrSExt(m_Value())) &&
5902         Op0->getOpcode() == Op1->getOpcode() &&
5903         !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) {
5904       bool IsUnsigned = isa<ZExtInst>(Op0);
5905       Type *Op0Ty = Op0->getOperand(0)->getType();
5906       Type *Op1Ty = Op1->getOperand(0)->getType();
5907       Type *LargestOpTy =
5908           Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
5909                                                                     : Op0Ty;
5910       auto *ExtType = VectorType::get(LargestOpTy, VectorTy);
5911 
5912       // Matched reduce.add(mul(ext(A), ext(B))), where the two ext may be of
5913       // different sizes. We take the largest type as the ext to reduce, and add
5914       // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))).
5915       InstructionCost ExtCost0 = TTI.getCastInstrCost(
5916           Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy),
5917           TTI::CastContextHint::None, CostKind, Op0);
5918       InstructionCost ExtCost1 = TTI.getCastInstrCost(
5919           Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy),
5920           TTI::CastContextHint::None, CostKind, Op1);
5921       InstructionCost MulCost =
5922           TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
5923 
5924       InstructionCost RedCost = TTI.getMulAccReductionCost(
5925           IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
5926       InstructionCost ExtraExtCost = 0;
5927       if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
5928         Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
5929         ExtraExtCost = TTI.getCastInstrCost(
5930             ExtraExtOp->getOpcode(), ExtType,
5931             VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy),
5932             TTI::CastContextHint::None, CostKind, ExtraExtOp);
5933       }
5934 
5935       if (RedCost.isValid() &&
5936           (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
5937         return I == RetI ? RedCost : 0;
5938     } else if (!match(I, m_ZExtOrSExt(m_Value()))) {
5939       // Matched reduce.add(mul())
5940       InstructionCost MulCost =
5941           TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
5942 
5943       InstructionCost RedCost = TTI.getMulAccReductionCost(
5944           true, RdxDesc.getRecurrenceType(), VectorTy, CostKind);
5945 
5946       if (RedCost.isValid() && RedCost < MulCost + BaseCost)
5947         return I == RetI ? RedCost : 0;
5948     }
5949   }
5950 
5951   return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt;
5952 }
5953 
5954 InstructionCost
5955 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
5956                                                      ElementCount VF) {
5957   // Calculate scalar cost only. Vectorization cost should be ready at this
5958   // moment.
5959   if (VF.isScalar()) {
5960     Type *ValTy = getLoadStoreType(I);
5961     const Align Alignment = getLoadStoreAlignment(I);
5962     unsigned AS = getLoadStoreAddressSpace(I);
5963 
5964     TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
5965     return TTI.getAddressComputationCost(ValTy) +
5966            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, CostKind,
5967                                OpInfo, I);
5968   }
5969   return getWideningCost(I, VF);
5970 }
5971 
5972 InstructionCost
5973 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
5974                                                      ElementCount VF) const {
5975 
5976   // There is no mechanism yet to create a scalable scalarization loop,
5977   // so this is currently Invalid.
5978   if (VF.isScalable())
5979     return InstructionCost::getInvalid();
5980 
5981   if (VF.isScalar())
5982     return 0;
5983 
5984   InstructionCost Cost = 0;
5985   Type *RetTy = toVectorTy(I->getType(), VF);
5986   if (!RetTy->isVoidTy() &&
5987       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
5988     Cost += TTI.getScalarizationOverhead(
5989         cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()),
5990         /*Insert*/ true,
5991         /*Extract*/ false, CostKind);
5992 
5993   // Some targets keep addresses scalar.
5994   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
5995     return Cost;
5996 
5997   // Some targets support efficient element stores.
5998   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
5999     return Cost;
6000 
6001   // Collect operands to consider.
6002   CallInst *CI = dyn_cast<CallInst>(I);
6003   Instruction::op_range Ops = CI ? CI->args() : I->operands();
6004 
6005   // Skip operands that do not require extraction/scalarization and do not incur
6006   // any overhead.
6007   SmallVector<Type *> Tys;
6008   for (auto *V : filterExtractingOperands(Ops, VF))
6009     Tys.push_back(maybeVectorizeType(V->getType(), VF));
6010   return Cost + TTI.getOperandsScalarizationOverhead(
6011                     filterExtractingOperands(Ops, VF), Tys, CostKind);
6012 }
6013 
6014 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
6015   if (VF.isScalar())
6016     return;
6017   NumPredStores = 0;
6018   for (BasicBlock *BB : TheLoop->blocks()) {
6019     // For each instruction in the old loop.
6020     for (Instruction &I : *BB) {
6021       Value *Ptr = getLoadStorePointerOperand(&I);
6022       if (!Ptr)
6023         continue;
6024 
6025       // TODO: We should generate better code and update the cost model for
6026       // predicated uniform stores. Today they are treated as any other
6027       // predicated store (see added test cases in
6028       // invariant-store-vectorization.ll).
6029       if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF))
6030         NumPredStores++;
6031 
6032       if (Legal->isUniformMemOp(I, VF)) {
6033         auto IsLegalToScalarize = [&]() {
6034           if (!VF.isScalable())
6035             // Scalarization of fixed length vectors "just works".
6036             return true;
6037 
6038           // We have dedicated lowering for unpredicated uniform loads and
6039           // stores.  Note that even with tail folding we know that at least
6040           // one lane is active (i.e. generalized predication is not possible
6041           // here), and the logic below depends on this fact.
6042           if (!foldTailByMasking())
6043             return true;
6044 
6045           // For scalable vectors, a uniform memop load is always
6046           // uniform-by-parts and we know how to scalarize that.
6047           if (isa<LoadInst>(I))
6048             return true;
6049 
6050           // A uniform store isn't necessarily uniform-by-parts,
6051           // so we can't assume scalarization.
6052           auto &SI = cast<StoreInst>(I);
6053           return TheLoop->isLoopInvariant(SI.getValueOperand());
6054         };
6055 
6056         const InstructionCost GatherScatterCost =
6057           isLegalGatherOrScatter(&I, VF) ?
6058           getGatherScatterCost(&I, VF) : InstructionCost::getInvalid();
6059 
6060         // Load: Scalar load + broadcast
6061         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6062         // FIXME: This cost is a significant under-estimate for tail folded
6063         // memory ops.
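             // Illustrative example (assumed, not from the source): a load from
             // a loop-invariant address
             //   %v = load i32, ptr %p      ; %p is invariant in the loop
             // can be emitted as one scalar load whose result is broadcast
             // (splatted) to a vector, rather than VF independent loads.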
6064         const InstructionCost ScalarizationCost =
6065             IsLegalToScalarize() ? getUniformMemOpCost(&I, VF)
6066                                  : InstructionCost::getInvalid();
6067 
6068         // Choose the better solution for the current VF.  Note that Invalid
6069         // costs compare as maximally large.  If both are invalid we pick
6070         // CM_Scalarize with an Invalid cost, signaling a vectorization abort.
6071         if (GatherScatterCost < ScalarizationCost)
6072           setWideningDecision(&I, VF, CM_GatherScatter, GatherScatterCost);
6073         else
6074           setWideningDecision(&I, VF, CM_Scalarize, ScalarizationCost);
6075         continue;
6076       }
6077 
6078       // We assume that widening is the best solution when possible.
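           // Illustrative example (assumed): a unit-stride access A[i] becomes a
           // single contiguous vector load/store (CM_Widen), while a stride of -1,
           // e.g. A[N - i], becomes a contiguous access followed by a vector
           // reverse (CM_Widen_Reverse).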
6079       if (memoryInstructionCanBeWidened(&I, VF)) {
6080         InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
6081         int ConsecutiveStride = Legal->isConsecutivePtr(
6082             getLoadStoreType(&I), getLoadStorePointerOperand(&I));
6083         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6084                "Expected consecutive stride.");
6085         InstWidening Decision =
6086             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6087         setWideningDecision(&I, VF, Decision, Cost);
6088         continue;
6089       }
6090 
6091       // Choose between Interleaving, Gather/Scatter or Scalarization.
6092       InstructionCost InterleaveCost = InstructionCost::getInvalid();
6093       unsigned NumAccesses = 1;
6094       if (isAccessInterleaved(&I)) {
6095         const auto *Group = getInterleavedAccessGroup(&I);
6096         assert(Group && "Fail to get an interleaved access group.");
6097 
6098         // Make one decision for the whole group.
6099         if (getWideningDecision(&I, VF) != CM_Unknown)
6100           continue;
6101 
6102         NumAccesses = Group->getNumMembers();
6103         if (interleavedAccessCanBeWidened(&I, VF))
6104           InterleaveCost = getInterleaveGroupCost(&I, VF);
6105       }
6106 
6107       InstructionCost GatherScatterCost =
6108           isLegalGatherOrScatter(&I, VF)
6109               ? getGatherScatterCost(&I, VF) * NumAccesses
6110               : InstructionCost::getInvalid();
6111 
6112       InstructionCost ScalarizationCost =
6113           getMemInstScalarizationCost(&I, VF) * NumAccesses;
6114 
6115       // Choose the better solution for the current VF, write down this
6116       // decision and use it during vectorization.
6117       InstructionCost Cost;
6118       InstWidening Decision;
6119       if (InterleaveCost <= GatherScatterCost &&
6120           InterleaveCost < ScalarizationCost) {
6121         Decision = CM_Interleave;
6122         Cost = InterleaveCost;
6123       } else if (GatherScatterCost < ScalarizationCost) {
6124         Decision = CM_GatherScatter;
6125         Cost = GatherScatterCost;
6126       } else {
6127         Decision = CM_Scalarize;
6128         Cost = ScalarizationCost;
6129       }
6130       // If the instruction belongs to an interleave group, the whole group
6131       // receives the same decision. The cost is recorded for the group, but
6132       // will actually be assigned to a single instruction.
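           // Illustrative example (assumed): the accesses A[2*i] and A[2*i+1]
           // form an interleave group with factor 2; the decision below covers
           // both members, with the cost attached to the group's insert position.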
6133       if (const auto *Group = getInterleavedAccessGroup(&I))
6134         setWideningDecision(Group, VF, Decision, Cost);
6135       else
6136         setWideningDecision(&I, VF, Decision, Cost);
6137     }
6138   }
6139 
6140   // Make sure that any load of an address and any other address computation
6141   // remain scalar unless there is gather/scatter support. This avoids
6142   // inevitable extracts into address registers, and also has the benefit of
6143   // activating LSR more, since that pass can't optimize vectorized
6144   // addresses.
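       // Illustrative example (assumed): if a loaded value only feeds an address,
       // e.g.
       //   %idx = load i64, ptr %p
       //   %gep = getelementptr i32, ptr %base, i64 %idx
       //   %val = load i32, ptr %gep
       // then %idx (and the rest of the address chain) should stay scalar on such
       // targets instead of being widened and extracted per lane.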
6145   if (TTI.prefersVectorizedAddressing())
6146     return;
6147 
6148   // Start with all scalar pointer uses.
6149   SmallPtrSet<Instruction *, 8> AddrDefs;
6150   for (BasicBlock *BB : TheLoop->blocks())
6151     for (Instruction &I : *BB) {
6152       Instruction *PtrDef =
6153         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6154       if (PtrDef && TheLoop->contains(PtrDef) &&
6155           getWideningDecision(&I, VF) != CM_GatherScatter)
6156         AddrDefs.insert(PtrDef);
6157     }
6158 
6159   // Add all instructions used to generate the addresses.
6160   SmallVector<Instruction *, 4> Worklist;
6161   append_range(Worklist, AddrDefs);
6162   while (!Worklist.empty()) {
6163     Instruction *I = Worklist.pop_back_val();
6164     for (auto &Op : I->operands())
6165       if (auto *InstOp = dyn_cast<Instruction>(Op))
6166         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6167             AddrDefs.insert(InstOp).second)
6168           Worklist.push_back(InstOp);
6169   }
6170 
6171   for (auto *I : AddrDefs) {
6172     if (isa<LoadInst>(I)) {
6173       // Setting the desired widening decision should ideally be handled
6174       // by cost functions, but since this involves the task of finding out
6175       // if the loaded register is involved in an address computation, it is
6176       // instead changed here when we know this is the case.
6177       InstWidening Decision = getWideningDecision(I, VF);
6178       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6179         // Scalarize a widened load of address.
6180         setWideningDecision(
6181             I, VF, CM_Scalarize,
6182             (VF.getKnownMinValue() *
6183              getMemoryInstructionCost(I, ElementCount::getFixed(1))));
6184       else if (const auto *Group = getInterleavedAccessGroup(I)) {
6185         // Scalarize an interleave group of address loads.
6186         for (unsigned I = 0; I < Group->getFactor(); ++I) {
6187           if (Instruction *Member = Group->getMember(I))
6188             setWideningDecision(
6189                 Member, VF, CM_Scalarize,
6190                 (VF.getKnownMinValue() *
6191                  getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
6192         }
6193       }
6194     } else
6195       // Make sure I gets scalarized and a cost estimate without
6196       // scalarization overhead.
6197       ForcedScalars[VF].insert(I);
6198   }
6199 }
6200 
6201 void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
6202   assert(!VF.isScalar() &&
6203          "Trying to set a vectorization decision for a scalar VF");
6204 
6205   auto ForcedScalar = ForcedScalars.find(VF);
6206   for (BasicBlock *BB : TheLoop->blocks()) {
6207     // For each instruction in the old loop.
6208     for (Instruction &I : *BB) {
6209       CallInst *CI = dyn_cast<CallInst>(&I);
6210 
6211       if (!CI)
6212         continue;
6213 
6214       InstructionCost ScalarCost = InstructionCost::getInvalid();
6215       InstructionCost VectorCost = InstructionCost::getInvalid();
6216       InstructionCost IntrinsicCost = InstructionCost::getInvalid();
6217       Function *ScalarFunc = CI->getCalledFunction();
6218       Type *ScalarRetTy = CI->getType();
6219       SmallVector<Type *, 4> Tys, ScalarTys;
6220       for (auto &ArgOp : CI->args())
6221         ScalarTys.push_back(ArgOp->getType());
6222 
6223       // Estimate cost of scalarized vector call. The source operands are
6224       // assumed to be vectors, so we need to extract individual elements from
6225       // there, execute VF scalar calls, and then gather the result into the
6226       // vector return value.
6227       InstructionCost ScalarCallCost =
6228           TTI.getCallInstrCost(ScalarFunc, ScalarRetTy, ScalarTys, CostKind);
6229 
6230       // Compute costs of unpacking argument values for the scalar calls and
6231       // packing the return values to a vector.
6232       InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);
6233 
6234       ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
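           // Worked example (illustrative numbers only): at VF = 4, a scalar call
           // costing 10 gives ScalarCost = 4 * 10 plus the extract/insert
           // overhead computed by getScalarizationOverhead above.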
6235       // Honor ForcedScalars and UniformAfterVectorization decisions.
6236       // TODO: For calls, it might still be more profitable to widen. Use
6237       // VPlan-based cost model to compare different options.
6238       if (VF.isVector() && ((ForcedScalar != ForcedScalars.end() &&
6239                              ForcedScalar->second.contains(CI)) ||
6240                             isUniformAfterVectorization(CI, VF))) {
6241         setCallWideningDecision(CI, VF, CM_Scalarize, nullptr,
6242                                 Intrinsic::not_intrinsic, std::nullopt,
6243                                 ScalarCost);
6244         continue;
6245       }
6246 
6247       bool MaskRequired = Legal->isMaskRequired(CI);
6248       // Compute corresponding vector type for return value and arguments.
6249       Type *RetTy = toVectorTy(ScalarRetTy, VF);
6250       for (Type *ScalarTy : ScalarTys)
6251         Tys.push_back(toVectorTy(ScalarTy, VF));
6252 
6253       // An in-loop reduction using an fmuladd intrinsic is a special case;
6254       // we don't want the normal cost for that intrinsic.
6255       if (RecurrenceDescriptor::isFMulAddIntrinsic(CI))
6256         if (auto RedCost = getReductionPatternCost(CI, VF, RetTy)) {
6257           setCallWideningDecision(CI, VF, CM_IntrinsicCall, nullptr,
6258                                   getVectorIntrinsicIDForCall(CI, TLI),
6259                                   std::nullopt, *RedCost);
6260           continue;
6261         }
6262 
6263       // Find the cost of vectorizing the call, if we can find a suitable
6264       // vector variant of the function.
6265       bool UsesMask = false;
6266       VFInfo FuncInfo;
6267       Function *VecFunc = nullptr;
6268       // Search through any available variants for one we can use at this VF.
6269       for (VFInfo &Info : VFDatabase::getMappings(*CI)) {
6270         // Must match requested VF.
6271         if (Info.Shape.VF != VF)
6272           continue;
6273 
6274         // Must take a mask argument if one is required
6275         if (MaskRequired && !Info.isMasked())
6276           continue;
6277 
6278         // Check that all parameter kinds are supported
6279         bool ParamsOk = true;
6280         for (VFParameter Param : Info.Shape.Parameters) {
6281           switch (Param.ParamKind) {
6282           case VFParamKind::Vector:
6283             break;
6284           case VFParamKind::OMP_Uniform: {
6285             Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
6286             // Make sure the scalar parameter in the loop is invariant.
6287             if (!PSE.getSE()->isLoopInvariant(PSE.getSCEV(ScalarParam),
6288                                               TheLoop))
6289               ParamsOk = false;
6290             break;
6291           }
6292           case VFParamKind::OMP_Linear: {
6293             Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
6294             // Find the stride for the scalar parameter in this loop and see if
6295             // it matches the stride for the variant.
6296             // TODO: do we need to figure out the cost of an extract to get the
6297             // first lane? Or do we hope that it will be folded away?
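                 // For example, a variant declared with a linear parameter of
                 // step 4 only matches if the scalar argument is an add
                 // recurrence {base,+,4} over this loop (checked below).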
6298             ScalarEvolution *SE = PSE.getSE();
6299             const auto *SAR =
6300                 dyn_cast<SCEVAddRecExpr>(SE->getSCEV(ScalarParam));
6301 
6302             if (!SAR || SAR->getLoop() != TheLoop) {
6303               ParamsOk = false;
6304               break;
6305             }
6306 
6307             const SCEVConstant *Step =
6308                 dyn_cast<SCEVConstant>(SAR->getStepRecurrence(*SE));
6309 
6310             if (!Step ||
6311                 Step->getAPInt().getSExtValue() != Param.LinearStepOrPos)
6312               ParamsOk = false;
6313 
6314             break;
6315           }
6316           case VFParamKind::GlobalPredicate:
6317             UsesMask = true;
6318             break;
6319           default:
6320             ParamsOk = false;
6321             break;
6322           }
6323         }
6324 
6325         if (!ParamsOk)
6326           continue;
6327 
6328         // Found a suitable candidate, stop here.
6329         VecFunc = CI->getModule()->getFunction(Info.VectorName);
6330         FuncInfo = Info;
6331         break;
6332       }
6333 
6334       // Add in the cost of synthesizing a mask if one wasn't required.
6335       InstructionCost MaskCost = 0;
6336       if (VecFunc && UsesMask && !MaskRequired)
6337         MaskCost = TTI.getShuffleCost(
6338             TargetTransformInfo::SK_Broadcast,
6339             VectorType::get(IntegerType::getInt1Ty(
6340                                 VecFunc->getFunctionType()->getContext()),
6341                             VF),
6342             {}, CostKind);
6343 
6344       if (TLI && VecFunc && !CI->isNoBuiltin())
6345         VectorCost =
6346             TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind) + MaskCost;
6347 
6348       // Find the cost of an intrinsic; some targets may have instructions that
6349       // perform the operation without needing an actual call.
6350       Intrinsic::ID IID = getVectorIntrinsicIDForCall(CI, TLI);
6351       if (IID != Intrinsic::not_intrinsic)
6352         IntrinsicCost = getVectorIntrinsicCost(CI, VF);
6353 
6354       InstructionCost Cost = ScalarCost;
6355       InstWidening Decision = CM_Scalarize;
6356 
6357       if (VectorCost <= Cost) {
6358         Cost = VectorCost;
6359         Decision = CM_VectorCall;
6360       }
6361 
6362       if (IntrinsicCost <= Cost) {
6363         Cost = IntrinsicCost;
6364         Decision = CM_IntrinsicCall;
6365       }
6366 
6367       setCallWideningDecision(CI, VF, Decision, VecFunc, IID,
6368                               FuncInfo.getParamIndexForOptionalMask(), Cost);
6369     }
6370   }
6371 }
6372 
6373 bool LoopVectorizationCostModel::shouldConsiderInvariant(Value *Op) {
6374   if (!Legal->isInvariant(Op))
6375     return false;
6376   // Consider Op invariant if neither it nor its operands are predicated
6377   // instructions in the loop; such instructions are not trivially hoistable.
6378   auto *OpI = dyn_cast<Instruction>(Op);
6379   return !OpI || !TheLoop->contains(OpI) ||
6380          (!isPredicatedInst(OpI) &&
6381           (!isa<PHINode>(OpI) || OpI->getParent() != TheLoop->getHeader()) &&
6382           all_of(OpI->operands(),
6383                  [this](Value *Op) { return shouldConsiderInvariant(Op); }));
6384 }
6385 
6386 InstructionCost
6387 LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6388                                                ElementCount VF) {
6389   // If we know that this instruction will remain uniform, check the cost of
6390   // the scalar version.
6391   if (isUniformAfterVectorization(I, VF))
6392     VF = ElementCount::getFixed(1);
6393 
6394   if (VF.isVector() && isProfitableToScalarize(I, VF))
6395     return InstsToScalarize[VF][I];
6396 
6397   // Forced scalars do not have any scalarization overhead.
6398   auto ForcedScalar = ForcedScalars.find(VF);
6399   if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
6400     auto InstSet = ForcedScalar->second;
6401     if (InstSet.count(I))
6402       return getInstructionCost(I, ElementCount::getFixed(1)) *
6403              VF.getKnownMinValue();
6404   }
6405 
6406   Type *RetTy = I->getType();
6407   if (canTruncateToMinimalBitwidth(I, VF))
6408     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6409   auto *SE = PSE.getSE();
6410 
6411   auto HasSingleCopyAfterVectorization = [this](Instruction *I,
6412                                                 ElementCount VF) -> bool {
6413     if (VF.isScalar())
6414       return true;
6415 
6416     auto Scalarized = InstsToScalarize.find(VF);
6417     assert(Scalarized != InstsToScalarize.end() &&
6418            "VF not yet analyzed for scalarization profitability");
6419     return !Scalarized->second.count(I) &&
6420            llvm::all_of(I->users(), [&](User *U) {
6421              auto *UI = cast<Instruction>(U);
6422              return !Scalarized->second.count(UI);
6423            });
6424   };
6425   (void)HasSingleCopyAfterVectorization;
6426 
6427   Type *VectorTy;
6428   if (isScalarAfterVectorization(I, VF)) {
6429     // With the exception of GEPs and PHIs, after scalarization there should
6430     // only be one copy of the instruction generated in the loop. This is
6431     // because the VF is either 1, or any instructions that need scalarizing
6432     // have already been dealt with by the time we get here. As a result,
6433     // we don't have to multiply the instruction cost by VF.
6434     assert(I->getOpcode() == Instruction::GetElementPtr ||
6435            I->getOpcode() == Instruction::PHI ||
6436            (I->getOpcode() == Instruction::BitCast &&
6437             I->getType()->isPointerTy()) ||
6438            HasSingleCopyAfterVectorization(I, VF));
6439     VectorTy = RetTy;
6440   } else
6441     VectorTy = toVectorTy(RetTy, VF);
6442 
6443   if (VF.isVector() && VectorTy->isVectorTy() &&
6444       !TTI.getNumberOfParts(VectorTy))
6445     return InstructionCost::getInvalid();
6446 
6447   // TODO: We need to estimate the cost of intrinsic calls.
6448   switch (I->getOpcode()) {
6449   case Instruction::GetElementPtr:
6450     // We mark this instruction as zero-cost because the cost of GEPs in
6451     // vectorized code depends on whether the corresponding memory instruction
6452     // is scalarized or not. Therefore, we handle GEPs with the memory
6453     // instruction cost.
6454     return 0;
6455   case Instruction::Br: {
6456     // In cases of scalarized and predicated instructions, there will be VF
6457     // predicated blocks in the vectorized loop. Each branch around these
6458     // blocks requires also an extract of its vector compare i1 element.
6459     // Note that the conditional branch from the loop latch will be replaced by
6460     // a single branch controlling the loop, so there is no extra overhead from
6461     // scalarization.
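         // Worked example (illustrative): at VF = 4 this models 4 extracts of the
         // i1 condition lanes plus 4 scalar branches around the predicated blocks.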
6462     bool ScalarPredicatedBB = false;
6463     BranchInst *BI = cast<BranchInst>(I);
6464     if (VF.isVector() && BI->isConditional() &&
6465         (PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) ||
6466          PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))) &&
6467         BI->getParent() != TheLoop->getLoopLatch())
6468       ScalarPredicatedBB = true;
6469 
6470     if (ScalarPredicatedBB) {
6471       // Not possible to scalarize scalable vector with predicated instructions.
6472       if (VF.isScalable())
6473         return InstructionCost::getInvalid();
6474       // Return cost for branches around scalarized and predicated blocks.
6475       auto *VecI1Ty =
6476           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6477       return (
6478           TTI.getScalarizationOverhead(
6479               VecI1Ty, APInt::getAllOnes(VF.getFixedValue()),
6480               /*Insert*/ false, /*Extract*/ true, CostKind) +
6481           (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
6482     }
6483 
6484     if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
6485       // The back-edge branch will remain, as will all scalar branches.
6486       return TTI.getCFInstrCost(Instruction::Br, CostKind);
6487 
6488     // This branch will be eliminated by if-conversion.
6489     return 0;
6490     // Note: We currently assume zero cost for an unconditional branch inside
6491     // a predicated block since it will become a fall-through, although we
6492     // may decide in the future to call TTI for all branches.
6493   }
6494   case Instruction::Switch: {
6495     if (VF.isScalar())
6496       return TTI.getCFInstrCost(Instruction::Switch, CostKind);
6497     auto *Switch = cast<SwitchInst>(I);
6498     return Switch->getNumCases() *
6499            TTI.getCmpSelInstrCost(
6500                Instruction::ICmp,
6501                toVectorTy(Switch->getCondition()->getType(), VF),
6502                toVectorTy(Type::getInt1Ty(I->getContext()), VF),
6503                CmpInst::ICMP_EQ, CostKind);
6504   }
6505   case Instruction::PHI: {
6506     auto *Phi = cast<PHINode>(I);
6507 
6508     // First-order recurrences are replaced by vector shuffles inside the loop.
6509     if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) {
6510       // For <vscale x 1 x i64>, if vscale = 1 we are unable to extract the
6511       // penultimate value of the recurrence.
6512       // TODO: Consider vscale_range info.
6513       if (VF.isScalable() && VF.getKnownMinValue() == 1)
6514         return InstructionCost::getInvalid();
6515       SmallVector<int> Mask(VF.getKnownMinValue());
6516       std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
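           // E.g. for VF = 4 the mask is <3, 4, 5, 6>: the last lane of the
           // previous iteration's vector followed by the first three lanes of
           // the current one.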
6517       return TTI.getShuffleCost(TargetTransformInfo::SK_Splice,
6518                                 cast<VectorType>(VectorTy), Mask, CostKind,
6519                                 VF.getKnownMinValue() - 1);
6520     }
6521 
6522     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6523     // converted into select instructions. We require N - 1 selects per phi
6524     // node, where N is the number of incoming values.
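         // For example, a phi merging three incoming values after if-conversion
         // becomes two chained selects, hence the N - 1 factor below.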
6525     if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) {
6526       Type *ResultTy = Phi->getType();
6527 
6528       // All instructions in an Any-of reduction chain are narrowed to bool.
6529       // Check if that is the case for this phi node.
6530       auto *HeaderUser = cast_if_present<PHINode>(
6531           find_singleton<User>(Phi->users(), [this](User *U, bool) -> User * {
6532             auto *Phi = dyn_cast<PHINode>(U);
6533             if (Phi && Phi->getParent() == TheLoop->getHeader())
6534               return Phi;
6535             return nullptr;
6536           }));
6537       if (HeaderUser) {
6538         auto &ReductionVars = Legal->getReductionVars();
6539         auto Iter = ReductionVars.find(HeaderUser);
6540         if (Iter != ReductionVars.end() &&
6541             RecurrenceDescriptor::isAnyOfRecurrenceKind(
6542                 Iter->second.getRecurrenceKind()))
6543           ResultTy = Type::getInt1Ty(Phi->getContext());
6544       }
6545       return (Phi->getNumIncomingValues() - 1) *
6546              TTI.getCmpSelInstrCost(
6547                  Instruction::Select, toVectorTy(ResultTy, VF),
6548                  toVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
6549                  CmpInst::BAD_ICMP_PREDICATE, CostKind);
6550     }
6551 
6552     // When tail folding with EVL, if the phi is part of an out of loop
6553     // reduction then it will be transformed into a wide vp_merge.
6554     if (VF.isVector() && foldTailWithEVL() &&
6555         Legal->getReductionVars().contains(Phi) && !isInLoopReduction(Phi)) {
6556       IntrinsicCostAttributes ICA(
6557           Intrinsic::vp_merge, toVectorTy(Phi->getType(), VF),
6558           {toVectorTy(Type::getInt1Ty(Phi->getContext()), VF)});
6559       return TTI.getIntrinsicInstrCost(ICA, CostKind);
6560     }
6561 
6562     return TTI.getCFInstrCost(Instruction::PHI, CostKind);
6563   }
6564   case Instruction::UDiv:
6565   case Instruction::SDiv:
6566   case Instruction::URem:
6567   case Instruction::SRem:
6568     if (VF.isVector() && isPredicatedInst(I)) {
6569       const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
6570       return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ?
6571         ScalarCost : SafeDivisorCost;
6572     }
6573     // We've proven all lanes safe to speculate, fall through.
6574     [[fallthrough]];
6575   case Instruction::Add:
6576   case Instruction::Sub: {
6577     auto Info = Legal->getHistogramInfo(I);
6578     if (Info && VF.isVector()) {
6579       const HistogramInfo *HGram = Info.value();
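           // Illustratively (assumed shape, not from the source), a histogram
           // update looks like 'Buckets[Indices[i]] += Val'; it is costed below
           // as the histogram intrinsic plus the update arithmetic.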
6580       // Assume that a non-constant update value (or a constant != 1) requires
6581       // a multiply, and add that into the cost.
6582       InstructionCost MulCost = TTI::TCC_Free;
6583       ConstantInt *RHS = dyn_cast<ConstantInt>(I->getOperand(1));
6584       if (!RHS || RHS->getZExtValue() != 1)
6585         MulCost =
6586             TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6587 
6588       // Find the cost of the histogram operation itself.
6589       Type *PtrTy = VectorType::get(HGram->Load->getPointerOperandType(), VF);
6590       Type *ScalarTy = I->getType();
6591       Type *MaskTy = VectorType::get(Type::getInt1Ty(I->getContext()), VF);
6592       IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add,
6593                                   Type::getVoidTy(I->getContext()),
6594                                   {PtrTy, ScalarTy, MaskTy});
6595 
6596       // Add the costs together with the add/sub operation.
6597       return TTI.getIntrinsicInstrCost(ICA, CostKind) + MulCost +
6598              TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, CostKind);
6599     }
6600     [[fallthrough]];
6601   }
6602   case Instruction::FAdd:
6603   case Instruction::FSub:
6604   case Instruction::Mul:
6605   case Instruction::FMul:
6606   case Instruction::FDiv:
6607   case Instruction::FRem:
6608   case Instruction::Shl:
6609   case Instruction::LShr:
6610   case Instruction::AShr:
6611   case Instruction::And:
6612   case Instruction::Or:
6613   case Instruction::Xor: {
6614     // If we're speculating on the stride being 1, the multiplication may
6615     // fold away.  We can generalize this for all operations using the notion
6616     // of neutral elements.  (TODO)
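         // For example, an index expression 'i * Stride' where runtime versioning
         // assumes Stride == 1 simplifies to 'i', so the multiply is free here.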
6617     if (I->getOpcode() == Instruction::Mul &&
6618         (PSE.getSCEV(I->getOperand(0))->isOne() ||
6619          PSE.getSCEV(I->getOperand(1))->isOne()))
6620       return 0;
6621 
6622     // Detect reduction patterns
6623     if (auto RedCost = getReductionPatternCost(I, VF, VectorTy))
6624       return *RedCost;
6625 
6626     // Certain instructions can be cheaper to vectorize if they have a constant
6627     // second vector operand. One example of this is shifts on x86.
6628     Value *Op2 = I->getOperand(1);
6629     if (!isa<Constant>(Op2) && PSE.getSE()->isSCEVable(Op2->getType()) &&
6630         isa<SCEVConstant>(PSE.getSCEV(Op2))) {
6631       Op2 = cast<SCEVConstant>(PSE.getSCEV(Op2))->getValue();
6632     }
6633     auto Op2Info = TTI.getOperandInfo(Op2);
6634     if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
6635         shouldConsiderInvariant(Op2))
6636       Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
6637 
6638     SmallVector<const Value *, 4> Operands(I->operand_values());
6639     return TTI.getArithmeticInstrCost(
6640         I->getOpcode(), VectorTy, CostKind,
6641         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6642         Op2Info, Operands, I, TLI);
6643   }
6644   case Instruction::FNeg: {
6645     return TTI.getArithmeticInstrCost(
6646         I->getOpcode(), VectorTy, CostKind,
6647         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6648         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6649         I->getOperand(0), I);
6650   }
6651   case Instruction::Select: {
6652     SelectInst *SI = cast<SelectInst>(I);
6653     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6654     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6655 
6656     const Value *Op0, *Op1;
6657     using namespace llvm::PatternMatch;
6658     if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
6659                         match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
6660       // select x, y, false --> x & y
6661       // select x, true, y --> x | y
6662       const auto [Op1VK, Op1VP] = TTI::getOperandInfo(Op0);
6663       const auto [Op2VK, Op2VP] = TTI::getOperandInfo(Op1);
6664       assert(Op0->getType()->getScalarSizeInBits() == 1 &&
6665               Op1->getType()->getScalarSizeInBits() == 1);
6666 
6667       SmallVector<const Value *, 2> Operands{Op0, Op1};
6668       return TTI.getArithmeticInstrCost(
6669           match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy,
6670           CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, I);
6671     }
6672 
6673     Type *CondTy = SI->getCondition()->getType();
6674     if (!ScalarCond)
6675       CondTy = VectorType::get(CondTy, VF);
6676 
6677     CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
6678     if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
6679       Pred = Cmp->getPredicate();
6680     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred,
6681                                   CostKind, {TTI::OK_AnyValue, TTI::OP_None},
6682                                   {TTI::OK_AnyValue, TTI::OP_None}, I);
6683   }
6684   case Instruction::ICmp:
6685   case Instruction::FCmp: {
6686     Type *ValTy = I->getOperand(0)->getType();
6687 
6688     if (canTruncateToMinimalBitwidth(I, VF)) {
6689       Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6690       (void)Op0AsInstruction;
6691       assert((!canTruncateToMinimalBitwidth(Op0AsInstruction, VF) ||
6692               MinBWs[I] == MinBWs[Op0AsInstruction]) &&
6693              "if both the operand and the compare are marked for "
6694              "truncation, they must have the same bitwidth");
6695       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[I]);
6696     }
6697 
6698     VectorTy = toVectorTy(ValTy, VF);
6699     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
6700                                   cast<CmpInst>(I)->getPredicate(), CostKind,
6701                                   {TTI::OK_AnyValue, TTI::OP_None},
6702                                   {TTI::OK_AnyValue, TTI::OP_None}, I);
6703   }
6704   case Instruction::Store:
6705   case Instruction::Load: {
6706     ElementCount Width = VF;
6707     if (Width.isVector()) {
6708       InstWidening Decision = getWideningDecision(I, Width);
6709       assert(Decision != CM_Unknown &&
6710              "CM decision should be taken at this point");
6711       if (getWideningCost(I, VF) == InstructionCost::getInvalid())
6712         return InstructionCost::getInvalid();
6713       if (Decision == CM_Scalarize)
6714         Width = ElementCount::getFixed(1);
6715     }
6716     VectorTy = toVectorTy(getLoadStoreType(I), Width);
6717     return getMemoryInstructionCost(I, VF);
6718   }
6719   case Instruction::BitCast:
6720     if (I->getType()->isPointerTy())
6721       return 0;
6722     [[fallthrough]];
6723   case Instruction::ZExt:
6724   case Instruction::SExt:
6725   case Instruction::FPToUI:
6726   case Instruction::FPToSI:
6727   case Instruction::FPExt:
6728   case Instruction::PtrToInt:
6729   case Instruction::IntToPtr:
6730   case Instruction::SIToFP:
6731   case Instruction::UIToFP:
6732   case Instruction::Trunc:
6733   case Instruction::FPTrunc: {
6734     // Computes the CastContextHint from a Load/Store instruction.
6735     auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
6736       assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
6737              "Expected a load or a store!");
6738 
6739       if (VF.isScalar() || !TheLoop->contains(I))
6740         return TTI::CastContextHint::Normal;
6741 
6742       switch (getWideningDecision(I, VF)) {
6743       case LoopVectorizationCostModel::CM_GatherScatter:
6744         return TTI::CastContextHint::GatherScatter;
6745       case LoopVectorizationCostModel::CM_Interleave:
6746         return TTI::CastContextHint::Interleave;
6747       case LoopVectorizationCostModel::CM_Scalarize:
6748       case LoopVectorizationCostModel::CM_Widen:
6749         return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
6750                                         : TTI::CastContextHint::Normal;
6751       case LoopVectorizationCostModel::CM_Widen_Reverse:
6752         return TTI::CastContextHint::Reversed;
6753       case LoopVectorizationCostModel::CM_Unknown:
6754         llvm_unreachable("Instr did not go through cost modelling?");
6755       case LoopVectorizationCostModel::CM_VectorCall:
6756       case LoopVectorizationCostModel::CM_IntrinsicCall:
6757         llvm_unreachable_internal("Instr has invalid widening decision");
6758       }
6759 
6760       llvm_unreachable("Unhandled case!");
6761     };
6762 
6763     unsigned Opcode = I->getOpcode();
6764     TTI::CastContextHint CCH = TTI::CastContextHint::None;
6765     // For Trunc, the context is the only user, which must be a StoreInst.
6766     if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
6767       if (I->hasOneUse())
6768         if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
6769           CCH = ComputeCCH(Store);
6770     }
6771     // For Z/Sext, the context is the operand, which must be a LoadInst.
6772     else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
6773              Opcode == Instruction::FPExt) {
6774       if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
6775         CCH = ComputeCCH(Load);
6776     }
6777 
6778     // We optimize the truncation of induction variables having constant
6779     // integer steps. The cost of these truncations is the same as the scalar
6780     // operation.
6781     if (isOptimizableIVTruncate(I, VF)) {
6782       auto *Trunc = cast<TruncInst>(I);
6783       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
6784                                   Trunc->getSrcTy(), CCH, CostKind, Trunc);
6785     }
6786 
6787     // Detect reduction patterns
6788     if (auto RedCost = getReductionPatternCost(I, VF, VectorTy))
6789       return *RedCost;
6790 
6791     Type *SrcScalarTy = I->getOperand(0)->getType();
6792     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6793     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6794       SrcScalarTy =
6795           IntegerType::get(SrcScalarTy->getContext(), MinBWs[Op0AsInstruction]);
6796     Type *SrcVecTy =
6797         VectorTy->isVectorTy() ? toVectorTy(SrcScalarTy, VF) : SrcScalarTy;
6798 
6799     if (canTruncateToMinimalBitwidth(I, VF)) {
6800       // If the result type is <= the source type, there will be no extend
6801       // after truncating the users to the minimal required bitwidth.
6802       if (VectorTy->getScalarSizeInBits() <= SrcVecTy->getScalarSizeInBits() &&
6803           (I->getOpcode() == Instruction::ZExt ||
6804            I->getOpcode() == Instruction::SExt))
6805         return 0;
6806     }
6807 
6808     return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
6809   }
6810   case Instruction::Call:
6811     return getVectorCallCost(cast<CallInst>(I), VF);
6812   case Instruction::ExtractValue:
6813     return TTI.getInstructionCost(I, CostKind);
6814   case Instruction::Alloca:
6815     // We cannot easily widen alloca to a scalable alloca, as
6816     // the result would need to be a vector of pointers.
6817     if (VF.isScalable())
6818       return InstructionCost::getInvalid();
6819     [[fallthrough]];
6820   default:
6821     // This opcode is unknown. Assume that it is the same as 'mul'.
6822     return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6823   } // end of switch.
6824 }
6825 
6826 void LoopVectorizationCostModel::collectValuesToIgnore() {
6827   // Ignore ephemeral values.
6828   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
6829 
6830   SmallVector<Value *, 4> DeadInterleavePointerOps;
6831   SmallVector<Value *, 4> DeadOps;
6832 
6833   // If a scalar epilogue is required, users outside the loop won't use
6834   // live-outs from the vector loop but from the scalar epilogue. Ignore them if
6835   // that is the case.
6836   bool RequiresScalarEpilogue = requiresScalarEpilogue(true);
6837   auto IsLiveOutDead = [this, RequiresScalarEpilogue](User *U) {
6838     return RequiresScalarEpilogue &&
6839            !TheLoop->contains(cast<Instruction>(U)->getParent());
6840   };
6841 
6842   LoopBlocksDFS DFS(TheLoop);
6843   DFS.perform(LI);
6844   MapVector<Value *, SmallVector<Value *>> DeadInvariantStoreOps;
6845   for (BasicBlock *BB : reverse(make_range(DFS.beginRPO(), DFS.endRPO())))
6846     for (Instruction &I : reverse(*BB)) {
6847       // Find all stores to invariant variables. Since they are going to sink
6848       // outside the loop, we do not need to calculate a cost for them.
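           // Illustrative example (assumed): a loop that stores a running
           // reduction value to the same invariant address on every iteration
           // only needs the value stored after the last iteration, so the earlier
           // stored values are dead for costing purposes.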
6849       StoreInst *SI;
6850       if ((SI = dyn_cast<StoreInst>(&I)) &&
6851           Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) {
6852         ValuesToIgnore.insert(&I);
6853         DeadInvariantStoreOps[SI->getPointerOperand()].push_back(
6854             SI->getValueOperand());
6855       }
6856 
6857       if (VecValuesToIgnore.contains(&I) || ValuesToIgnore.contains(&I))
6858         continue;
6859 
6860       // Add instructions that would be trivially dead and are only used by
6861       // values already ignored to DeadOps to seed worklist.
6862       if (wouldInstructionBeTriviallyDead(&I, TLI) &&
6863           all_of(I.users(), [this, IsLiveOutDead](User *U) {
6864             return VecValuesToIgnore.contains(U) ||
6865                    ValuesToIgnore.contains(U) || IsLiveOutDead(U);
6866           }))
6867         DeadOps.push_back(&I);
6868 
6869       // For interleave groups, we only create a pointer for the start of the
6870       // interleave group. Queue up addresses of group members except the insert
6871       // position for further processing.
6872       if (isAccessInterleaved(&I)) {
6873         auto *Group = getInterleavedAccessGroup(&I);
6874         if (Group->getInsertPos() == &I)
6875           continue;
6876         Value *PointerOp = getLoadStorePointerOperand(&I);
6877         DeadInterleavePointerOps.push_back(PointerOp);
6878       }
6879 
6880       // Queue branches for analysis. They are dead if their successors only
6881       // contain dead instructions.
6882       if (auto *Br = dyn_cast<BranchInst>(&I)) {
6883         if (Br->isConditional())
6884           DeadOps.push_back(&I);
6885       }
6886     }
6887 
6888   // Mark ops feeding interleave group members as free, if they are only used
6889   // by other dead computations.
6890   for (unsigned I = 0; I != DeadInterleavePointerOps.size(); ++I) {
6891     auto *Op = dyn_cast<Instruction>(DeadInterleavePointerOps[I]);
6892     if (!Op || !TheLoop->contains(Op) || any_of(Op->users(), [this](User *U) {
6893           Instruction *UI = cast<Instruction>(U);
6894           return !VecValuesToIgnore.contains(U) &&
6895                  (!isAccessInterleaved(UI) ||
6896                   getInterleavedAccessGroup(UI)->getInsertPos() == UI);
6897         }))
6898       continue;
6899     VecValuesToIgnore.insert(Op);
6900     DeadInterleavePointerOps.append(Op->op_begin(), Op->op_end());
6901   }
6902 
6903   for (const auto &[_, Ops] : DeadInvariantStoreOps) {
6904     for (Value *Op : ArrayRef(Ops).drop_back())
6905       DeadOps.push_back(Op);
6906   }
6907   // Mark ops that would be trivially dead and are only used by ignored
6908   // instructions as free.
6909   BasicBlock *Header = TheLoop->getHeader();
6910 
6911   // Returns true if the block contains only dead instructions. Such blocks will
6912   // be removed by VPlan-to-VPlan transforms and won't be considered by the
6913   // VPlan-based cost model, so skip them in the legacy cost-model as well.
6914   auto IsEmptyBlock = [this](BasicBlock *BB) {
6915     return all_of(*BB, [this](Instruction &I) {
6916       return ValuesToIgnore.contains(&I) || VecValuesToIgnore.contains(&I) ||
6917              (isa<BranchInst>(&I) && !cast<BranchInst>(&I)->isConditional());
6918     });
6919   };
6920   for (unsigned I = 0; I != DeadOps.size(); ++I) {
6921     auto *Op = dyn_cast<Instruction>(DeadOps[I]);
6922 
6923     // Check if the branch should be considered dead.
6924     if (auto *Br = dyn_cast_or_null<BranchInst>(Op)) {
6925       BasicBlock *ThenBB = Br->getSuccessor(0);
6926       BasicBlock *ElseBB = Br->getSuccessor(1);
6927       // Don't consider branches leaving the loop for simplification.
6928       if (!TheLoop->contains(ThenBB) || !TheLoop->contains(ElseBB))
6929         continue;
6930       bool ThenEmpty = IsEmptyBlock(ThenBB);
6931       bool ElseEmpty = IsEmptyBlock(ElseBB);
6932       if ((ThenEmpty && ElseEmpty) ||
6933           (ThenEmpty && ThenBB->getSingleSuccessor() == ElseBB &&
6934            ElseBB->phis().empty()) ||
6935           (ElseEmpty && ElseBB->getSingleSuccessor() == ThenBB &&
6936            ThenBB->phis().empty())) {
6937         VecValuesToIgnore.insert(Br);
6938         DeadOps.push_back(Br->getCondition());
6939       }
6940       continue;
6941     }
6942 
6943     // Skip any op that shouldn't be considered dead.
6944     if (!Op || !TheLoop->contains(Op) ||
6945         (isa<PHINode>(Op) && Op->getParent() == Header) ||
6946         !wouldInstructionBeTriviallyDead(Op, TLI) ||
6947         any_of(Op->users(), [this, IsLiveOutDead](User *U) {
6948           return !VecValuesToIgnore.contains(U) &&
6949                  !ValuesToIgnore.contains(U) && !IsLiveOutDead(U);
6950         }))
6951       continue;
6952 
6953     if (!TheLoop->contains(Op->getParent()))
6954       continue;
6955 
6956     // If all of Op's users are in ValuesToIgnore, add it to ValuesToIgnore
6957     // which applies for both scalar and vector versions. Otherwise it is only
6958     // dead in vector versions, so only add it to VecValuesToIgnore.
6959     if (all_of(Op->users(),
6960                [this](User *U) { return ValuesToIgnore.contains(U); }))
6961       ValuesToIgnore.insert(Op);
6962 
6963     VecValuesToIgnore.insert(Op);
6964     DeadOps.append(Op->op_begin(), Op->op_end());
6965   }
6966 
6967   // Ignore type-promoting instructions we identified during reduction
6968   // detection.
6969   for (const auto &Reduction : Legal->getReductionVars()) {
6970     const RecurrenceDescriptor &RedDes = Reduction.second;
6971     const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
6972     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6973   }
6974   // Ignore type-casting instructions we identified during induction
6975   // detection.
6976   for (const auto &Induction : Legal->getInductionVars()) {
6977     const InductionDescriptor &IndDes = Induction.second;
6978     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6979     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6980   }
6981 }
6982 
6983 void LoopVectorizationCostModel::collectInLoopReductions() {
6984   for (const auto &Reduction : Legal->getReductionVars()) {
6985     PHINode *Phi = Reduction.first;
6986     const RecurrenceDescriptor &RdxDesc = Reduction.second;
6987 
6988     // We don't collect reductions that are type promoted (yet).
6989     if (RdxDesc.getRecurrenceType() != Phi->getType())
6990       continue;
6991 
6992     // If the target would prefer this reduction to happen "in-loop", then we
6993     // want to record it as such.
6994     unsigned Opcode = RdxDesc.getOpcode();
6995     if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
6996         !TTI.preferInLoopReduction(Opcode, Phi->getType(),
6997                                    TargetTransformInfo::ReductionFlags()))
6998       continue;
6999 
7000     // Check that we can correctly put the reductions into the loop, by
7001     // finding the chain of operations that leads from the phi to the loop
7002     // exit value.
7003     SmallVector<Instruction *, 4> ReductionOperations =
7004         RdxDesc.getReductionOpChain(Phi, TheLoop);
7005     bool InLoop = !ReductionOperations.empty();
7006 
7007     if (InLoop) {
7008       InLoopReductions.insert(Phi);
7009       // Add the elements to InLoopReductionImmediateChains for cost modelling.
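           // Illustratively, for a chain 'phi %r -> %a = add %r, .. -> %b = add
           // %a, ..' the map records %a -> phi and %b -> %a.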
7010       Instruction *LastChain = Phi;
7011       for (auto *I : ReductionOperations) {
7012         InLoopReductionImmediateChains[I] = LastChain;
7013         LastChain = I;
7014       }
7015     }
7016     LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
7017                       << " reduction for phi: " << *Phi << "\n");
7018   }
7019 }
7020 
7021 // This function will select a scalable VF if the target supports scalable
7022 // vectors and a fixed one otherwise.
7023 // TODO: we could return a pair of values that specify the max VF and
7024 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
7025 // `buildVPlans(VF, VF)`. We cannot do it because VPlan at the moment
7026 // doesn't have a cost model that can choose which plan to execute if
7027 // more than one is generated.
7028 static ElementCount determineVPlanVF(const TargetTransformInfo &TTI,
7029                                      LoopVectorizationCostModel &CM) {
7030   unsigned WidestType;
7031   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
7032 
7033   TargetTransformInfo::RegisterKind RegKind =
7034       TTI.enableScalableVectorization()
7035           ? TargetTransformInfo::RGK_ScalableVector
7036           : TargetTransformInfo::RGK_FixedWidthVector;
7037 
7038   TypeSize RegSize = TTI.getRegisterBitWidth(RegKind);
7039   unsigned N = RegSize.getKnownMinValue() / WidestType;
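       // E.g. a 128-bit register with a widest element type of 32 bits gives
       // N = 4, i.e. VF = 4 (or vscale x 4 when the register is scalable).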
7040   return ElementCount::get(N, RegSize.isScalable());
7041 }
7042 
7043 VectorizationFactor
7044 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
7045   ElementCount VF = UserVF;
7046   // Outer loop handling: They may require CFG and instruction level
7047   // transformations before even evaluating whether vectorization is profitable.
7048   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7049   // the vectorization pipeline.
7050   if (!OrigLoop->isInnermost()) {
7051     // If the user doesn't provide a vectorization factor, determine a
7052     // reasonable one.
7053     if (UserVF.isZero()) {
7054       VF = determineVPlanVF(TTI, CM);
7055       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
7056 
7057       // Make sure we have a VF > 1 for stress testing.
7058       if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
7059         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
7060                           << "overriding computed VF.\n");
7061         VF = ElementCount::getFixed(4);
7062       }
7063     } else if (UserVF.isScalable() && !TTI.supportsScalableVectors() &&
7064                !ForceTargetSupportsScalableVectors) {
7065       LLVM_DEBUG(dbgs() << "LV: Not vectorizing. Scalable VF requested, but "
7066                         << "not supported by the target.\n");
7067       reportVectorizationFailure(
7068           "Scalable vectorization requested but not supported by the target",
7069           "the scalable user-specified vectorization width for outer-loop "
7070           "vectorization cannot be used because the target does not support "
7071           "scalable vectors.",
7072           "ScalableVFUnfeasible", ORE, OrigLoop);
7073       return VectorizationFactor::Disabled();
7074     }
7075     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7076     assert(isPowerOf2_32(VF.getKnownMinValue()) &&
7077            "VF needs to be a power of two");
7078     LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
7079                       << "VF " << VF << " to build VPlans.\n");
7080     buildVPlans(VF, VF);
7081 
7082     // For VPlan build stress testing, we bail out after VPlan construction.
7083     if (VPlanBuildStressTest)
7084       return VectorizationFactor::Disabled();
7085 
7086     return {VF, 0 /*Cost*/, 0 /* ScalarCost */};
7087   }
7088 
7089   LLVM_DEBUG(
7090       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
7091                 "VPlan-native path.\n");
7092   return VectorizationFactor::Disabled();
7093 }
7094 
7095 void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
7096   assert(OrigLoop->isInnermost() && "Inner loop expected.");
7097   CM.collectValuesToIgnore();
7098   CM.collectElementTypesForWidening();
7099 
7100   FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
7101   if (!MaxFactors) // Cases that should not be vectorized or interleaved.
7102     return;
7103 
7104   // Invalidate interleave groups if all blocks of the loop will be predicated.
7105   if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
7106       !useMaskedInterleavedAccesses(TTI)) {
7107     LLVM_DEBUG(
7108         dbgs()
7109         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
7110            "which requires masked-interleaved support.\n");
7111     if (CM.InterleaveInfo.invalidateGroups())
7112       // Invalidating interleave groups also requires invalidating all decisions
7113       // based on them, which includes widening decisions and uniform and scalar
7114       // values.
7115       CM.invalidateCostModelingDecisions();
7116   }
7117 
7118   if (CM.foldTailByMasking())
7119     Legal->prepareToFoldTailByMasking();
7120 
7121   ElementCount MaxUserVF =
7122       UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
7123   if (UserVF) {
7124     if (!ElementCount::isKnownLE(UserVF, MaxUserVF)) {
7125       reportVectorizationInfo(
7126           "UserVF ignored because it may be larger than the maximal safe VF",
7127           "InvalidUserVF", ORE, OrigLoop);
7128     } else {
7129       assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
7130              "VF needs to be a power of two");
7131       // Collect the instructions (and their associated costs) that will be more
7132       // profitable to scalarize.
7133       CM.collectInLoopReductions();
7134       if (CM.selectUserVectorizationFactor(UserVF)) {
7135         LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
7136         buildVPlansWithVPRecipes(UserVF, UserVF);
7137         LLVM_DEBUG(printPlans(dbgs()));
7138         return;
7139       }
7140       reportVectorizationInfo("UserVF ignored because of invalid costs.",
7141                               "InvalidCost", ORE, OrigLoop);
7142     }
7143   }
7144 
7145   // Collect the Vectorization Factor Candidates.
7146   SmallVector<ElementCount> VFCandidates;
7147   for (auto VF = ElementCount::getFixed(1);
7148        ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
7149     VFCandidates.push_back(VF);
7150   for (auto VF = ElementCount::getScalable(1);
7151        ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
7152     VFCandidates.push_back(VF);
7153 
7154   CM.collectInLoopReductions();
7155   for (const auto &VF : VFCandidates) {
7156     // Collect Uniform and Scalar instructions after vectorization with VF.
7157     CM.collectUniformsAndScalars(VF);
7158 
7159     // Collect the instructions (and their associated costs) that will be more
7160     // profitable to scalarize.
7161     if (VF.isVector())
7162       CM.collectInstsToScalarize(VF);
7163   }
7164 
7165   buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
7166   buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
7167 
7168   LLVM_DEBUG(printPlans(dbgs()));
7169 }
7170 
7171 InstructionCost VPCostContext::getLegacyCost(Instruction *UI,
7172                                              ElementCount VF) const {
7173   if (ForceTargetInstructionCost.getNumOccurrences())
7174     return InstructionCost(ForceTargetInstructionCost.getNumOccurrences());
7175   return CM.getInstructionCost(UI, VF);
7176 }
7177 
7178 bool VPCostContext::skipCostComputation(Instruction *UI, bool IsVector) const {
7179   return CM.ValuesToIgnore.contains(UI) ||
7180          (IsVector && CM.VecValuesToIgnore.contains(UI)) ||
7181          SkipCostComputation.contains(UI);
7182 }
7183 
7184 InstructionCost
7185 LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
7186                                           VPCostContext &CostCtx) const {
7187   InstructionCost Cost;
7188   // Cost modeling for inductions is inaccurate in the legacy cost model
7189   // compared to the recipes that are generated. To match it initially, during
7190   // VPlan cost-model bring-up, directly use the induction costs from the legacy
7191   // cost model. Note that we do this as pre-processing; the VPlan may not have
7192   // any recipes associated with the original induction increment instruction
7193   // and may replace truncates with VPWidenIntOrFpInductionRecipe. We precompute
7194   // the cost of induction phis and increments (both that are represented by
7195   // recipes and those that are not), to avoid distinguishing between them here,
7196   // and skip all recipes that represent induction phis and increments (the
7197   // former case) later on, if they exist, to avoid counting them twice.
7198   // Similarly we pre-compute the cost of any optimized truncates.
7199   // TODO: Switch to more accurate costing based on VPlan.
7200   for (const auto &[IV, IndDesc] : Legal->getInductionVars()) {
7201     Instruction *IVInc = cast<Instruction>(
7202         IV->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
7203     SmallVector<Instruction *> IVInsts = {IVInc};
7204     for (unsigned I = 0; I != IVInsts.size(); I++) {
7205       for (Value *Op : IVInsts[I]->operands()) {
7206         auto *OpI = dyn_cast<Instruction>(Op);
7207         if (Op == IV || !OpI || !OrigLoop->contains(OpI) || !Op->hasOneUse())
7208           continue;
7209         IVInsts.push_back(OpI);
7210       }
7211     }
7212     IVInsts.push_back(IV);
7213     for (User *U : IV->users()) {
7214       auto *CI = cast<Instruction>(U);
7215       if (!CostCtx.CM.isOptimizableIVTruncate(CI, VF))
7216         continue;
7217       IVInsts.push_back(CI);
7218     }
7219 
7220     // If the vector loop gets executed exactly once with the given VF, ignore
7221     // the costs of comparison and induction instructions, as they'll get
7222     // simplified away.
7223     // TODO: Remove this code after stepping away from the legacy cost model and
7224     // adding code to simplify VPlans before calculating their costs.
7225     auto TC = PSE.getSE()->getSmallConstantTripCount(OrigLoop);
7226     if (VF.isFixed() && TC == VF.getFixedValue() && !CM.foldTailByMasking())
7227       addFullyUnrolledInstructionsToIgnore(OrigLoop, Legal->getInductionVars(),
7228                                            CostCtx.SkipCostComputation);
7229 
7230     for (Instruction *IVInst : IVInsts) {
7231       if (CostCtx.skipCostComputation(IVInst, VF.isVector()))
7232         continue;
7233       InstructionCost InductionCost = CostCtx.getLegacyCost(IVInst, VF);
7234       LLVM_DEBUG({
7235         dbgs() << "Cost of " << InductionCost << " for VF " << VF
7236                << ": induction instruction " << *IVInst << "\n";
7237       });
7238       Cost += InductionCost;
7239       CostCtx.SkipCostComputation.insert(IVInst);
7240     }
7241   }
7242 
7243   // Compute the cost of all exiting conditions of the loop using the legacy
7244   // cost model. This is to match the legacy behavior, which adds the cost of
7245   // all exit conditions. Note that this over-estimates the cost, as there will
7246   // be a single condition to control the vector loop.
7247   SmallVector<BasicBlock *> Exiting;
7248   CM.TheLoop->getExitingBlocks(Exiting);
7249   SetVector<Instruction *> ExitInstrs;
7250   // Collect all exit conditions.
7251   for (BasicBlock *EB : Exiting) {
7252     auto *Term = dyn_cast<BranchInst>(EB->getTerminator());
7253     if (!Term)
7254       continue;
7255     if (auto *CondI = dyn_cast<Instruction>(Term->getOperand(0))) {
7256       ExitInstrs.insert(CondI);
7257     }
7258   }
7259   // Compute the cost of all instructions only feeding the exit conditions.
7260   for (unsigned I = 0; I != ExitInstrs.size(); ++I) {
7261     Instruction *CondI = ExitInstrs[I];
7262     if (!OrigLoop->contains(CondI) ||
7263         !CostCtx.SkipCostComputation.insert(CondI).second)
7264       continue;
7265     InstructionCost CondICost = CostCtx.getLegacyCost(CondI, VF);
7266     LLVM_DEBUG({
7267       dbgs() << "Cost of " << CondICost << " for VF " << VF
7268              << ": exit condition instruction " << *CondI << "\n";
7269     });
7270     Cost += CondICost;
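    // Also add operands whose only in-loop users are exit-condition
    // instructions; their cost is attributed to the exit condition as well.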
7271     for (Value *Op : CondI->operands()) {
7272       auto *OpI = dyn_cast<Instruction>(Op);
7273       if (!OpI || any_of(OpI->users(), [&ExitInstrs, this](User *U) {
7274             return OrigLoop->contains(cast<Instruction>(U)->getParent()) &&
7275                    !ExitInstrs.contains(cast<Instruction>(U));
7276           }))
7277         continue;
7278       ExitInstrs.insert(OpI);
7279     }
7280   }
7281 
7282   // The legacy cost model has special logic to compute the cost of in-loop
7283   // reductions, which may be smaller than the sum of all instructions involved
7284   // in the reduction.
7285   // TODO: Switch to costing based on VPlan once the logic has been ported.
7286   for (const auto &[RedPhi, RdxDesc] : Legal->getReductionVars()) {
7287     if (ForceTargetInstructionCost.getNumOccurrences())
7288       continue;
7289 
7290     if (!CM.isInLoopReduction(RedPhi))
7291       continue;
7292 
7293     const auto &ChainOps = RdxDesc.getReductionOpChain(RedPhi, OrigLoop);
7294     SetVector<Instruction *> ChainOpsAndOperands(ChainOps.begin(),
7295                                                  ChainOps.end());
7296     auto IsZExtOrSExt = [](const unsigned Opcode) -> bool {
7297       return Opcode == Instruction::ZExt || Opcode == Instruction::SExt;
7298     };
7299     // Also include the operands of instructions in the chain, as the cost-model
7300     // may mark extends as free.
7301     //
7302     // For ARM, some of the instructions can be folded into the reduction
7303     // instruction. So we need to mark all folded instructions free.
7304     // For example: We can fold reduce(mul(ext(A), ext(B))) into one
7305     // instruction.
7306     for (auto *ChainOp : ChainOps) {
7307       for (Value *Op : ChainOp->operands()) {
7308         if (auto *I = dyn_cast<Instruction>(Op)) {
7309           ChainOpsAndOperands.insert(I);
7310           if (I->getOpcode() == Instruction::Mul) {
7311             auto *Ext0 = dyn_cast<Instruction>(I->getOperand(0));
7312             auto *Ext1 = dyn_cast<Instruction>(I->getOperand(1));
7313             if (Ext0 && IsZExtOrSExt(Ext0->getOpcode()) && Ext1 &&
7314                 Ext0->getOpcode() == Ext1->getOpcode()) {
7315               ChainOpsAndOperands.insert(Ext0);
7316               ChainOpsAndOperands.insert(Ext1);
7317             }
7318           }
7319         }
7320       }
7321     }
7322 
7323     // Pre-compute the cost for I, if it has a reduction pattern cost.
7324     for (Instruction *I : ChainOpsAndOperands) {
7325       auto ReductionCost =
7326           CM.getReductionPatternCost(I, VF, toVectorTy(I->getType(), VF));
7327       if (!ReductionCost)
7328         continue;
7329 
7330       assert(!CostCtx.SkipCostComputation.contains(I) &&
7331              "reduction op visited multiple times");
7332       CostCtx.SkipCostComputation.insert(I);
7333       LLVM_DEBUG(dbgs() << "Cost of " << ReductionCost << " for VF " << VF
7334                         << ":\n in-loop reduction " << *I << "\n");
7335       Cost += *ReductionCost;
7336     }
7337   }
7338 
7339   // Pre-compute the costs for branches except for the backedge, as the number
7340   // of replicate regions in a VPlan may not directly match the number of
7341   // branches, which would lead to different decisions.
7342   // TODO: Compute cost of branches for each replicate region in the VPlan,
7343   // which is more accurate than the legacy cost model.
7344   for (BasicBlock *BB : OrigLoop->blocks()) {
7345     if (CostCtx.skipCostComputation(BB->getTerminator(), VF.isVector()))
7346       continue;
7347     CostCtx.SkipCostComputation.insert(BB->getTerminator());
7348     if (BB == OrigLoop->getLoopLatch())
7349       continue;
7350     auto BranchCost = CostCtx.getLegacyCost(BB->getTerminator(), VF);
7351     Cost += BranchCost;
7352   }
7353 
7354   // Pre-compute costs for instructions that are forced-scalar or profitable to
7355   // scalarize. Their costs will be computed separately in the legacy cost
7356   // model.
7357   for (Instruction *ForcedScalar : CM.ForcedScalars[VF]) {
7358     if (CostCtx.skipCostComputation(ForcedScalar, VF.isVector()))
7359       continue;
7360     CostCtx.SkipCostComputation.insert(ForcedScalar);
7361     InstructionCost ForcedCost = CostCtx.getLegacyCost(ForcedScalar, VF);
7362     LLVM_DEBUG({
7363       dbgs() << "Cost of " << ForcedCost << " for VF " << VF
7364              << ": forced scalar " << *ForcedScalar << "\n";
7365     });
7366     Cost += ForcedCost;
7367   }
7368   for (const auto &[Scalarized, ScalarCost] : CM.InstsToScalarize[VF]) {
7369     if (CostCtx.skipCostComputation(Scalarized, VF.isVector()))
7370       continue;
7371     CostCtx.SkipCostComputation.insert(Scalarized);
7372     LLVM_DEBUG({
7373       dbgs() << "Cost of " << ScalarCost << " for VF " << VF
7374              << ": profitable to scalarize " << *Scalarized << "\n";
7375     });
7376     Cost += ScalarCost;
7377   }
7378 
7379   return Cost;
7380 }
7381 
7382 InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
7383                                                ElementCount VF) const {
7384   VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM,
7385                         CM.CostKind);
7386   InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx);
7387 
7388   // Now compute and add the VPlan-based cost.
7389   Cost += Plan.cost(VF, CostCtx);
7390 #ifndef NDEBUG
7391   unsigned EstimatedWidth = getEstimatedRuntimeVF(OrigLoop, CM.TTI, VF);
7392   LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost
7393                     << " (Estimated cost per lane: ");
7394   if (Cost.isValid()) {
7395     double CostPerLane = double(*Cost.getValue()) / EstimatedWidth;
7396     LLVM_DEBUG(dbgs() << format("%.1f", CostPerLane));
7397   } else /* No point dividing an invalid cost - it will still be invalid */
7398     LLVM_DEBUG(dbgs() << "Invalid");
7399   LLVM_DEBUG(dbgs() << ")\n");
7400 #endif
7401   return Cost;
7402 }
7403 
7404 #ifndef NDEBUG
7405 /// Return true if the original loop \p TheLoop contains any instructions that
7406 /// do not have corresponding recipes in \p Plan and are not marked to be
7407 /// ignored in \p CostCtx. This means the VPlan contains simplifications that
7408 /// the legacy cost-model did not account for.
7409 static bool planContainsAdditionalSimplifications(VPlan &Plan,
7410                                                   VPCostContext &CostCtx,
7411                                                   Loop *TheLoop) {
7412   // First collect all instructions for the recipes in Plan.
7413   auto GetInstructionForCost = [](const VPRecipeBase *R) -> Instruction * {
7414     if (auto *S = dyn_cast<VPSingleDefRecipe>(R))
7415       return dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
7416     if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(R))
7417       return &WidenMem->getIngredient();
7418     return nullptr;
7419   };
7420 
7421   DenseSet<Instruction *> SeenInstrs;
7422   auto Iter = vp_depth_first_deep(Plan.getVectorLoopRegion()->getEntry());
7423   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
7424     for (VPRecipeBase &R : *VPBB) {
7425       if (auto *IR = dyn_cast<VPInterleaveRecipe>(&R)) {
7426         auto *IG = IR->getInterleaveGroup();
7427         unsigned NumMembers = IG->getNumMembers();
7428         for (unsigned I = 0; I != NumMembers; ++I) {
7429           if (Instruction *M = IG->getMember(I))
7430             SeenInstrs.insert(M);
7431         }
7432         continue;
7433       }
7434       // The VPlan-based cost model is more accurate for partial reduction and
7435       // comparing against the legacy cost isn't desirable.
7436       if (isa<VPPartialReductionRecipe>(&R))
7437         return true;
7438       if (Instruction *UI = GetInstructionForCost(&R))
7439         SeenInstrs.insert(UI);
7440     }
7441   }
7442 
7443   // Return true if the loop contains any instructions that are not also part of
7444   // the VPlan or are skipped for VPlan-based cost computations. This indicates
7445   // that the VPlan contains extra simplifications.
7446   return any_of(TheLoop->blocks(), [&SeenInstrs, &CostCtx,
7447                                     TheLoop](BasicBlock *BB) {
7448     return any_of(*BB, [&SeenInstrs, &CostCtx, TheLoop, BB](Instruction &I) {
7449       if (isa<PHINode>(&I) && BB == TheLoop->getHeader())
7450         return false;
7451       return !SeenInstrs.contains(&I) && !CostCtx.skipCostComputation(&I, true);
7452     });
7453   });
7454 }
7455 #endif
7456 
7457 VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
7458   if (VPlans.empty())
7459     return VectorizationFactor::Disabled();
7460   // If there is a single VPlan with a single VF, return it directly.
7461   VPlan &FirstPlan = *VPlans[0];
7462   if (VPlans.size() == 1 && size(FirstPlan.vectorFactors()) == 1)
7463     return {*FirstPlan.vectorFactors().begin(), 0, 0};
7464 
7465   LLVM_DEBUG(dbgs() << "LV: Computing best VF using cost kind: "
7466                     << (CM.CostKind == TTI::TCK_RecipThroughput
7467                             ? "Reciprocal Throughput\n"
7468                         : CM.CostKind == TTI::TCK_Latency
7469                             ? "Instruction Latency\n"
7470                         : CM.CostKind == TTI::TCK_CodeSize ? "Code Size\n"
7471                         : CM.CostKind == TTI::TCK_SizeAndLatency
7472                             ? "Code Size and Latency\n"
7473                             : "Unknown\n"));
7474 
7475   ElementCount ScalarVF = ElementCount::getFixed(1);
7476   assert(hasPlanWithVF(ScalarVF) &&
7477          "More than a single plan/VF w/o any plan having scalar VF");
7478 
7479   // TODO: Compute scalar cost using VPlan-based cost model.
7480   InstructionCost ScalarCost = CM.expectedCost(ScalarVF);
7481   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ScalarCost << ".\n");
7482   VectorizationFactor ScalarFactor(ScalarVF, ScalarCost, ScalarCost);
7483   VectorizationFactor BestFactor = ScalarFactor;
7484 
7485   bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
7486   if (ForceVectorization) {
7487     // Ignore scalar width, because the user explicitly wants vectorization.
7488     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
7489     // evaluation.
7490     BestFactor.Cost = InstructionCost::getMax();
7491   }
7492 
7493   for (auto &P : VPlans) {
7494     for (ElementCount VF : P->vectorFactors()) {
7495       if (VF.isScalar())
7496         continue;
7497       if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
7498         LLVM_DEBUG(
7499             dbgs()
7500             << "LV: Not considering vector loop of width " << VF
7501             << " because it will not generate any vector instructions.\n");
7502         continue;
7503       }
7504 
7505       InstructionCost Cost = cost(*P, VF);
7506       VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);
7507       if (isMoreProfitable(CurrentFactor, BestFactor))
7508         BestFactor = CurrentFactor;
7509 
7510       // If profitable add it to ProfitableVF list.
7511       if (isMoreProfitable(CurrentFactor, ScalarFactor))
7512         ProfitableVFs.push_back(CurrentFactor);
7513     }
7514   }
7515 
7516 #ifndef NDEBUG
7517   // Select the optimal vectorization factor according to the legacy cost-model.
7518   // This is now only used to verify the decisions by the new VPlan-based
7519   // cost-model and will be retired once the VPlan-based cost-model is
7520   // stabilized.
7521   VectorizationFactor LegacyVF = selectVectorizationFactor();
7522   VPlan &BestPlan = getPlanFor(BestFactor.Width);
7523 
7524   // Pre-compute the cost and use it to check if BestPlan contains any
7525   // simplifications not accounted for in the legacy cost model. If that's the
7526   // case, don't trigger the assertion, as the extra simplifications may cause a
7527   // different VF to be picked by the VPlan-based cost model.
7528   VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM,
7529                         CM.CostKind);
7530   precomputeCosts(BestPlan, BestFactor.Width, CostCtx);
7531   assert((BestFactor.Width == LegacyVF.Width ||
7532           planContainsAdditionalSimplifications(getPlanFor(BestFactor.Width),
7533                                                 CostCtx, OrigLoop) ||
7534           planContainsAdditionalSimplifications(getPlanFor(LegacyVF.Width),
7535                                                 CostCtx, OrigLoop)) &&
7536          " VPlan cost model and legacy cost model disagreed");
7537   assert((BestFactor.Width.isScalar() || BestFactor.ScalarCost > 0) &&
7538          "when vectorizing, the scalar cost must be computed.");
7539 #endif
7540 
7541   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << BestFactor.Width << ".\n");
7542   return BestFactor;
7543 }
7544 
7545 static void addRuntimeUnrollDisableMetaData(Loop *L) {
7546   SmallVector<Metadata *, 4> MDs;
7547   // Reserve first location for self reference to the LoopID metadata node.
7548   MDs.push_back(nullptr);
7549   bool IsUnrollMetadata = false;
7550   MDNode *LoopID = L->getLoopID();
7551   if (LoopID) {
7552     // First find existing loop unrolling disable metadata.
7553     for (unsigned I = 1, IE = LoopID->getNumOperands(); I < IE; ++I) {
7554       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(I));
7555       if (MD) {
7556         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
7557         IsUnrollMetadata |=
7558             S && S->getString().starts_with("llvm.loop.unroll.disable");
7559       }
7560       MDs.push_back(LoopID->getOperand(I));
7561     }
7562   }
7563 
7564   if (!IsUnrollMetadata) {
7565     // Add runtime unroll disable metadata.
7566     LLVMContext &Context = L->getHeader()->getContext();
7567     SmallVector<Metadata *, 1> DisableOperands;
7568     DisableOperands.push_back(
7569         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7570     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7571     MDs.push_back(DisableNode);
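    // The new loop ID keeps all existing operands and appends the
    // llvm.loop.unroll.runtime.disable hint.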
7572     MDNode *NewLoopID = MDNode::get(Context, MDs);
7573     // Set operand 0 to refer to the loop id itself.
7574     NewLoopID->replaceOperandWith(0, NewLoopID);
7575     L->setLoopID(NewLoopID);
7576   }
7577 }
7578 
7579 // If \p R is a ComputeReductionResult when vectorizing the epilog loop,
7580 // fix the reduction's scalar PHI node by adding the incoming value from the
7581 // main vector loop.
7582 static void fixReductionScalarResumeWhenVectorizingEpilog(
7583     VPRecipeBase *R, VPTransformState &State, BasicBlock *LoopMiddleBlock,
7584     BasicBlock *BypassBlock) {
7585   auto *EpiRedResult = dyn_cast<VPInstruction>(R);
7586   if (!EpiRedResult ||
7587       EpiRedResult->getOpcode() != VPInstruction::ComputeReductionResult)
7588     return;
7589 
7590   auto *EpiRedHeaderPhi =
7591       cast<VPReductionPHIRecipe>(EpiRedResult->getOperand(0));
7592   const RecurrenceDescriptor &RdxDesc =
7593       EpiRedHeaderPhi->getRecurrenceDescriptor();
7594   Value *MainResumeValue =
7595       EpiRedHeaderPhi->getStartValue()->getUnderlyingValue();
7596   if (RecurrenceDescriptor::isAnyOfRecurrenceKind(
7597           RdxDesc.getRecurrenceKind())) {
7598     auto *Cmp = cast<ICmpInst>(MainResumeValue);
7599     assert(Cmp->getPredicate() == CmpInst::ICMP_NE &&
7600            "AnyOf expected to start with ICMP_NE");
7601     assert(Cmp->getOperand(1) == RdxDesc.getRecurrenceStartValue() &&
7602            "AnyOf expected to start by comparing main resume value to original "
7603            "start value");
7604     MainResumeValue = Cmp->getOperand(0);
7605   } else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(
7606                  RdxDesc.getRecurrenceKind())) {
7607     using namespace llvm::PatternMatch;
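    // The resume value produced by the main loop for a FindLastIV reduction
    // has the form select(OrigResumeV == StartValue, Sentinel, OrigResumeV);
    // look through the select to recover the underlying resume value.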
7608     Value *Cmp, *OrigResumeV;
7609     bool IsExpectedPattern =
7610         match(MainResumeValue, m_Select(m_OneUse(m_Value(Cmp)),
7611                                         m_Specific(RdxDesc.getSentinelValue()),
7612                                         m_Value(OrigResumeV))) &&
7613         match(Cmp,
7614               m_SpecificICmp(ICmpInst::ICMP_EQ, m_Specific(OrigResumeV),
7615                              m_Specific(RdxDesc.getRecurrenceStartValue())));
7616     assert(IsExpectedPattern && "Unexpected reduction resume pattern");
7617     (void)IsExpectedPattern;
7618     MainResumeValue = OrigResumeV;
7619   }
7620   PHINode *MainResumePhi = cast<PHINode>(MainResumeValue);
7621 
7622   // When fixing reductions in the epilogue loop we should already have
7623   // created a bc.merge.rdx Phi after the main vector body. Ensure that we carry
7624   // over the incoming values correctly.
7625   using namespace VPlanPatternMatch;
7626   auto IsResumePhi = [](VPUser *U) {
7627     return match(
7628         U, m_VPInstruction<VPInstruction::ResumePhi>(m_VPValue(), m_VPValue()));
7629   };
7630   assert(count_if(EpiRedResult->users(), IsResumePhi) == 1 &&
7631          "ResumePhi must have a single user");
7632   auto *EpiResumePhiVPI =
7633       cast<VPInstruction>(*find_if(EpiRedResult->users(), IsResumePhi));
7634   auto *EpiResumePhi = cast<PHINode>(State.get(EpiResumePhiVPI, true));
7635   EpiResumePhi->setIncomingValueForBlock(
7636       BypassBlock, MainResumePhi->getIncomingValueForBlock(BypassBlock));
7637 }
7638 
7639 DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
7640     ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan,
7641     InnerLoopVectorizer &ILV, DominatorTree *DT, bool VectorizingEpilogue,
7642     const DenseMap<const SCEV *, Value *> *ExpandedSCEVs) {
7643   assert(BestVPlan.hasVF(BestVF) &&
7644          "Trying to execute plan with unsupported VF");
7645   assert(BestVPlan.hasUF(BestUF) &&
7646          "Trying to execute plan with unsupported UF");
7647   assert(
7648       ((VectorizingEpilogue && ExpandedSCEVs) ||
7649        (!VectorizingEpilogue && !ExpandedSCEVs)) &&
7650       "expanded SCEVs to reuse can only be used during epilogue vectorization");
7651 
7652   // TODO: Move to VPlan transform stage once the transition to the VPlan-based
7653   // cost model is complete for better cost estimates.
7654   VPlanTransforms::unrollByUF(BestVPlan, BestUF,
7655                               OrigLoop->getHeader()->getContext());
7656   VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
7657   VPlanTransforms::convertToConcreteRecipes(BestVPlan);
7658 
7659   // Perform the actual loop transformation.
7660   VPTransformState State(&TTI, BestVF, BestUF, LI, DT, ILV.Builder, &ILV,
7661                          &BestVPlan, OrigLoop->getParentLoop(),
7662                          Legal->getWidestInductionType());
7663 
7664 #ifdef EXPENSIVE_CHECKS
7665   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
7666 #endif
7667 
7668   // 0. Generate SCEV-dependent code in the entry, including TripCount, before
7669   // making any changes to the CFG.
7670   if (!BestVPlan.getEntry()->empty())
7671     BestVPlan.getEntry()->execute(&State);
7672 
7673   if (!ILV.getTripCount())
7674     ILV.setTripCount(State.get(BestVPlan.getTripCount(), VPLane(0)));
7675   else
7676     assert(VectorizingEpilogue && "should only re-use the existing trip "
7677                                   "count during epilogue vectorization");
7678 
7679   // 1. Set up the skeleton for vectorization, including vector pre-header and
7680   // middle block. The vector loop is created during VPlan execution.
7681   VPBasicBlock *VectorPH =
7682       cast<VPBasicBlock>(BestVPlan.getEntry()->getSingleSuccessor());
7683   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(
7684       ExpandedSCEVs ? *ExpandedSCEVs : State.ExpandedSCEVs);
7685   if (VectorizingEpilogue)
7686     VPlanTransforms::removeDeadRecipes(BestVPlan);
7687 
7688   // Only use noalias metadata when using memory checks guaranteeing no overlap
7689   // across all iterations.
7690   const LoopAccessInfo *LAI = ILV.Legal->getLAI();
7691   std::unique_ptr<LoopVersioning> LVer = nullptr;
7692   if (LAI && !LAI->getRuntimePointerChecking()->getChecks().empty() &&
7693       !LAI->getRuntimePointerChecking()->getDiffChecks()) {
7694 
7695     //  We currently don't use LoopVersioning for the actual loop cloning but we
7696     //  still use it to add the noalias metadata.
7697     //  TODO: Find a better way to re-use LoopVersioning functionality to add
7698     //        metadata.
7699     LVer = std::make_unique<LoopVersioning>(
7700         *LAI, LAI->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, DT,
7701         PSE.getSE());
7702     State.LVer = &*LVer;
7703     State.LVer->prepareNoAliasMetadata();
7704   }
7705 
7706   ILV.printDebugTracesAtStart();
7707 
7708   //===------------------------------------------------===//
7709   //
7710   // Notice: any optimizations or new instructions that go
7711   // into the code below should also be implemented in
7712   // the cost-model.
7713   //
7714   //===------------------------------------------------===//
7715 
7716   // 2. Copy and widen instructions from the old loop into the new loop.
7717   BestVPlan.prepareToExecute(
7718       ILV.getTripCount(),
7719       ILV.getOrCreateVectorTripCount(ILV.LoopVectorPreHeader), State);
7720   replaceVPBBWithIRVPBB(VectorPH, State.CFG.PrevBB);
7721 
7722   BestVPlan.execute(&State);
7723 
7724   auto *MiddleVPBB = BestVPlan.getMiddleBlock();
7725   // 2.5 When vectorizing the epilogue, fix reduction and induction resume
7726   // values from the additional bypass block.
7727   if (VectorizingEpilogue) {
7728     assert(!ILV.Legal->hasUncountableEarlyExit() &&
7729            "Epilogue vectorisation not yet supported with early exits");
7730     BasicBlock *BypassBlock = ILV.getAdditionalBypassBlock();
7731     for (VPRecipeBase &R : *MiddleVPBB) {
7732       fixReductionScalarResumeWhenVectorizingEpilog(
7733           &R, State, State.CFG.VPBB2IRBB[MiddleVPBB], BypassBlock);
7734     }
7735     BasicBlock *PH = OrigLoop->getLoopPreheader();
7736     for (const auto &[IVPhi, _] : Legal->getInductionVars()) {
7737       auto *Inc = cast<PHINode>(IVPhi->getIncomingValueForBlock(PH));
7738       Value *V = ILV.getInductionAdditionalBypassValue(IVPhi);
7739       Inc->setIncomingValueForBlock(BypassBlock, V);
7740     }
7741   }
7742 
7743   // 2.6. Maintain Loop Hints
7744   // Keep all loop hints from the original loop on the vector loop (we'll
7745   // replace the vectorizer-specific hints below).
7746   if (auto *LoopRegion = BestVPlan.getVectorLoopRegion()) {
7747     MDNode *OrigLoopID = OrigLoop->getLoopID();
7748 
7749     std::optional<MDNode *> VectorizedLoopID =
7750         makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7751                                         LLVMLoopVectorizeFollowupVectorized});
7752 
7753     VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
7754     Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]);
7755     if (VectorizedLoopID) {
7756       L->setLoopID(*VectorizedLoopID);
7757     } else {
7758       // Keep all loop hints from the original loop on the vector loop (we'll
7759       // replace the vectorizer-specific hints below).
7760       if (MDNode *LID = OrigLoop->getLoopID())
7761         L->setLoopID(LID);
7762 
7763       LoopVectorizeHints Hints(L, true, *ORE);
7764       Hints.setAlreadyVectorized();
7765     }
7766     TargetTransformInfo::UnrollingPreferences UP;
7767     TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE);
7768     if (!UP.UnrollVectorizedLoop || VectorizingEpilogue)
7769       addRuntimeUnrollDisableMetaData(L);
7770   }
7771 
7772   // 3. Fix the vectorized code: take care of header phi's, live-outs,
7773   //    predication, updating analyses.
7774   ILV.fixVectorizedLoop(State);
7775 
7776   ILV.printDebugTracesAtEnd();
7777 
7778   // 4. Adjust branch weight of the branch in the middle block.
7779   if (BestVPlan.getVectorLoopRegion()) {
7780     auto *MiddleVPBB = BestVPlan.getMiddleBlock();
7781     auto *MiddleTerm =
7782         cast<BranchInst>(State.CFG.VPBB2IRBB[MiddleVPBB]->getTerminator());
7783     if (MiddleTerm->isConditional() &&
7784         hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
7785       // Assume that `Count % VectorTripCount` is equally distributed.
7786       unsigned TripCount = BestVPlan.getUF() * State.VF.getKnownMinValue();
7787       assert(TripCount > 0 && "trip count should not be zero");
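      // For example, a plan with VF 4 and UF 2 covers 8 scalar iterations per
      // vector iteration, giving branch weights {1, 7}.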
7788       const uint32_t Weights[] = {1, TripCount - 1};
7789       setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false);
7790     }
7791   }
7792 
7793   return State.ExpandedSCEVs;
7794 }
7795 
7796 //===--------------------------------------------------------------------===//
7797 // EpilogueVectorizerMainLoop
7798 //===--------------------------------------------------------------------===//
7799 
7800 /// This function is partially responsible for generating the control flow
7801 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7802 BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton(
7803     const SCEV2ValueTy &ExpandedSCEVs) {
7804   createVectorLoopSkeleton("");
7805 
7806   // Generate the code to check the minimum iteration count of the vector
7807   // epilogue (see below).
7808   EPI.EpilogueIterationCountCheck =
7809       emitIterationCountCheck(LoopScalarPreHeader, true);
7810   EPI.EpilogueIterationCountCheck->setName("iter.check");
7811 
7812   // Generate the code to check any assumptions that we've made for SCEV
7813   // expressions.
7814   EPI.SCEVSafetyCheck = emitSCEVChecks(LoopScalarPreHeader);
7815 
7816   // Generate the code that checks at runtime if arrays overlap. We put the
7817   // checks into a separate block to make the more common case of few elements
7818   // faster.
7819   EPI.MemSafetyCheck = emitMemRuntimeChecks(LoopScalarPreHeader);
7820 
7821   // Generate the iteration count check for the main loop, *after* the check
7822   // for the epilogue loop, so that the path-length is shorter for the case
7823   // that goes directly through the vector epilogue. The longer path length for
7824   // the main loop is compensated for by the gain from vectorizing the larger
7825   // trip count. Note: the branch will get updated later on when we vectorize
7826   // the epilogue.
7827   EPI.MainLoopIterationCountCheck =
7828       emitIterationCountCheck(LoopScalarPreHeader, false);
7829 
7830   // Generate the induction variable.
7831   EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
7832 
7833   return LoopVectorPreHeader;
7834 }
7835 
7836 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
7837   LLVM_DEBUG({
7838     dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7839            << "Main Loop VF:" << EPI.MainLoopVF
7840            << ", Main Loop UF:" << EPI.MainLoopUF
7841            << ", Epilogue Loop VF:" << EPI.EpilogueVF
7842            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7843   });
7844 }
7845 
7846 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
7847   DEBUG_WITH_TYPE(VerboseDebug, {
7848     dbgs() << "intermediate fn:\n"
7849            << *OrigLoop->getHeader()->getParent() << "\n";
7850   });
7851 }
7852 
7853 BasicBlock *
7854 EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
7855                                                     bool ForEpilogue) {
7856   assert(Bypass && "Expected valid bypass basic block.");
7857   ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF;
7858   unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
7859   Value *Count = getTripCount();
7860   // Reuse existing vector loop preheader for TC checks.
7861   // Note that new preheader block is generated for vector loop.
7862   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
7863   IRBuilder<> Builder(TCCheckBlock->getTerminator());
7864 
7865   // Generate code to check if the loop's trip count is less than VF * UF of
7866   // the vector loop being checked (the epilogue loop if ForEpilogue is set).
7867   auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF.isVector()
7868                                                     : VF.isVector())
7869                ? ICmpInst::ICMP_ULE
7870                : ICmpInst::ICMP_ULT;
7871 
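  // For example, with a fixed VF of 4 and UF of 2 this emits roughly
  //   %min.iters.check = icmp ult i64 %count, 8
  // (ule instead of ult when a scalar epilogue is required), where %count
  // stands for the trip count computed above.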
7872   Value *CheckMinIters = Builder.CreateICmp(
7873       P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor),
7874       "min.iters.check");
7875 
7876   if (!ForEpilogue)
7877     TCCheckBlock->setName("vector.main.loop.iter.check");
7878 
7879   // Create new preheader for vector loop.
7880   LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
7881                                    DT, LI, nullptr, "vector.ph");
7882 
7883   if (ForEpilogue) {
7884     assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
7885                                  DT->getNode(Bypass)->getIDom()) &&
7886            "TC check is expected to dominate Bypass");
7887 
7888     LoopBypassBlocks.push_back(TCCheckBlock);
7889 
7890     // Save the trip count so we don't have to regenerate it in the
7891     // vec.epilog.iter.check. This is safe to do because the trip count
7892     // generated here dominates the vector epilog iter check.
7893     EPI.TripCount = Count;
7894   }
7895 
7896   BranchInst &BI =
7897       *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
7898   if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
7899     setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
7900   ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
7901 
7902   introduceCheckBlockInVPlan(TCCheckBlock);
7903   return TCCheckBlock;
7904 }
7905 
7906 //===--------------------------------------------------------------------===//
7907 // EpilogueVectorizerEpilogueLoop
7908 //===--------------------------------------------------------------------===//
7909 
7910 /// This function is partially responsible for generating the control flow
7911 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7912 BasicBlock *
7913 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
7914     const SCEV2ValueTy &ExpandedSCEVs) {
7915   createVectorLoopSkeleton("vec.epilog.");
7916 
7917   // Now, compare the remaining count and if there aren't enough iterations to
7918   // execute the vectorized epilogue, skip to the scalar part.
7919   LoopVectorPreHeader->setName("vec.epilog.ph");
7920   BasicBlock *VecEpilogueIterationCountCheck =
7921       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->begin(), DT, LI,
7922                  nullptr, "vec.epilog.iter.check", true);
7923   emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader,
7924                                           VecEpilogueIterationCountCheck);
7925   AdditionalBypassBlock = VecEpilogueIterationCountCheck;
7926 
7927   // Adjust the control flow taking the state info from the main loop
7928   // vectorization into account.
7929   assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
7930          "expected this to be saved from the previous pass.");
7931   EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
7932       VecEpilogueIterationCountCheck, LoopVectorPreHeader);
7933 
7934   EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
7935       VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7936 
7937   if (EPI.SCEVSafetyCheck)
7938     EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
7939         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7940   if (EPI.MemSafetyCheck)
7941     EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
7942         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7943 
7944   DT->changeImmediateDominator(LoopScalarPreHeader,
7945                                EPI.EpilogueIterationCountCheck);
7946   // Keep track of bypass blocks, as they feed start values to the induction and
7947   // reduction phis in the scalar loop preheader.
7948   if (EPI.SCEVSafetyCheck)
7949     LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
7950   if (EPI.MemSafetyCheck)
7951     LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
7952   LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
7953 
7954   // The vec.epilog.iter.check block may contain Phi nodes from inductions or
7955   // reductions which merge control-flow from the latch block and the middle
7956   // block. Update the incoming values here and move the Phi into the preheader.
7957   SmallVector<PHINode *, 4> PhisInBlock;
7958   for (PHINode &Phi : VecEpilogueIterationCountCheck->phis())
7959     PhisInBlock.push_back(&Phi);
7960 
7961   for (PHINode *Phi : PhisInBlock) {
7962     Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHIIt());
7963     Phi->replaceIncomingBlockWith(
7964         VecEpilogueIterationCountCheck->getSinglePredecessor(),
7965         VecEpilogueIterationCountCheck);
7966 
7967     // If the phi doesn't have an incoming value from the
7968     // EpilogueIterationCountCheck, we are done. Otherwise remove the incoming
7969     // value and also those from other check blocks. This is needed for
7970     // reduction phis only.
7971     if (none_of(Phi->blocks(), [&](BasicBlock *IncB) {
7972           return EPI.EpilogueIterationCountCheck == IncB;
7973         }))
7974       continue;
7975     Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
7976     if (EPI.SCEVSafetyCheck)
7977       Phi->removeIncomingValue(EPI.SCEVSafetyCheck);
7978     if (EPI.MemSafetyCheck)
7979       Phi->removeIncomingValue(EPI.MemSafetyCheck);
7980   }
7981 
7982   // Generate bypass values from the additional bypass block. Note that when the
7983   // vectorized epilogue is skipped due to the iteration count check, the
7984   // resume value for the induction variable comes from the trip count of the
7985   // main vector loop, passed as the second argument.
7986   createInductionAdditionalBypassValues(ExpandedSCEVs, EPI.VectorTripCount);
7987   return LoopVectorPreHeader;
7988 }
7989 
7990 BasicBlock *
7991 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
7992     BasicBlock *Bypass, BasicBlock *Insert) {
7993 
7994   assert(EPI.TripCount &&
7995          "Expected trip count to have been saved in the first pass.");
7996   assert(
7997       (!isa<Instruction>(EPI.TripCount) ||
7998        DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
7999       "saved trip count does not dominate insertion point.");
8000   Value *TC = EPI.TripCount;
8001   IRBuilder<> Builder(Insert->getTerminator());
8002   Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
8003 
8004   // Generate code to check if the remaining iteration count is less than
8005   // VF * UF of the vector epilogue loop.
8006   auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector())
8007                ? ICmpInst::ICMP_ULE
8008                : ICmpInst::ICMP_ULT;
8009 
8010   Value *CheckMinIters =
8011       Builder.CreateICmp(P, Count,
8012                          createStepForVF(Builder, Count->getType(),
8013                                          EPI.EpilogueVF, EPI.EpilogueUF),
8014                          "min.epilog.iters.check");
8015 
8016   BranchInst &BI =
8017       *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
8018   if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
8019     unsigned MainLoopStep = UF * VF.getKnownMinValue();
8020     unsigned EpilogueLoopStep =
8021         EPI.EpilogueUF * EPI.EpilogueVF.getKnownMinValue();
8022     // We assume the remaining `Count` is equally distributed in
8023     // [0, MainLoopStep)
8024     // So the probability for `Count < EpilogueLoopStep` should be
8025     // min(MainLoopStep, EpilogueLoopStep) / MainLoopStep
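    // For example, a main loop with VF 16 and UF 2 (MainLoopStep = 32) and an
    // epilogue loop with VF 8 and UF 1 (EpilogueLoopStep = 8) give the weights
    // {8, 24}, i.e. the epilogue loop is skipped in about 1 of 4 cases.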
8026     unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep);
8027     const uint32_t Weights[] = {EstimatedSkipCount,
8028                                 MainLoopStep - EstimatedSkipCount};
8029     setBranchWeights(BI, Weights, /*IsExpected=*/false);
8030   }
8031   ReplaceInstWithInst(Insert->getTerminator(), &BI);
8032   LoopBypassBlocks.push_back(Insert);
8033 
8034   // A new entry block has been created for the epilogue VPlan. Hook it in, as
8035   // otherwise we would try to modify the entry to the main vector loop.
8036   VPIRBasicBlock *NewEntry = Plan.createVPIRBasicBlock(Insert);
8037   VPBasicBlock *OldEntry = Plan.getEntry();
8038   VPBlockUtils::reassociateBlocks(OldEntry, NewEntry);
8039   Plan.setEntry(NewEntry);
8040   // OldEntry is now dead and will be cleaned up when the plan gets destroyed.
8041 
8042   introduceCheckBlockInVPlan(Insert);
8043   return Insert;
8044 }
8045 
8046 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
8047   LLVM_DEBUG({
8048     dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
8049            << "Epilogue Loop VF:" << EPI.EpilogueVF
8050            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
8051   });
8052 }
8053 
8054 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
8055   DEBUG_WITH_TYPE(VerboseDebug, {
8056     dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
8057   });
8058 }
8059 
8060 iterator_range<mapped_iterator<Use *, std::function<VPValue *(Value *)>>>
8061 VPRecipeBuilder::mapToVPValues(User::op_range Operands) {
8062   std::function<VPValue *(Value *)> Fn = [this](Value *Op) {
8063     return getVPValueOrAddLiveIn(Op);
8064   };
8065   return map_range(Operands, Fn);
8066 }
8067 
8068 void VPRecipeBuilder::createSwitchEdgeMasks(SwitchInst *SI) {
8069   BasicBlock *Src = SI->getParent();
8070   assert(!OrigLoop->isLoopExiting(Src) &&
8071          all_of(successors(Src),
8072                 [this](BasicBlock *Succ) {
8073                   return OrigLoop->getHeader() != Succ;
8074                 }) &&
8075          "unsupported switch either exiting loop or continuing to header");
8076   // Create masks where the terminator in Src is a switch. We create masks for
8077   // all edges at the same time. This is more efficient, as we can create and
8078   // collect the compares for all cases at once.
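  // For example, for a switch on %c where cases 0 and 1 branch to %bb.a,
  // case 2 branches to %bb.b and the default destination is %bb.c, the edge
  // masks (before combining with the block-in mask of Src) are:
  //   Src -> %bb.a: (%c == 0) || (%c == 1)
  //   Src -> %bb.b: (%c == 2)
  //   Src -> %bb.c: !(((%c == 0) || (%c == 1)) || (%c == 2))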
8079   VPValue *Cond = getVPValueOrAddLiveIn(SI->getCondition());
8080   BasicBlock *DefaultDst = SI->getDefaultDest();
8081   MapVector<BasicBlock *, SmallVector<VPValue *>> Dst2Compares;
8082   for (auto &C : SI->cases()) {
8083     BasicBlock *Dst = C.getCaseSuccessor();
8084     assert(!EdgeMaskCache.contains({Src, Dst}) && "Edge masks already created");
8085     // Cases whose destination is the same as default are redundant and can be
8086     // ignored - they will get there anyhow.
8087     if (Dst == DefaultDst)
8088       continue;
8089     auto &Compares = Dst2Compares[Dst];
8090     VPValue *V = getVPValueOrAddLiveIn(C.getCaseValue());
8091     Compares.push_back(Builder.createICmp(CmpInst::ICMP_EQ, Cond, V));
8092   }
8093 
8094   // We need to handle 2 separate cases below for all entries in Dst2Compares,
8095   // which excludes destinations matching the default destination.
8096   VPValue *SrcMask = getBlockInMask(Src);
8097   VPValue *DefaultMask = nullptr;
8098   for (const auto &[Dst, Conds] : Dst2Compares) {
8099     // 1. Dst is not the default destination. Dst is reached if any of the cases
8100     // with destination == Dst are taken. Join the conditions for each case
8101     // whose destination == Dst using an OR.
8102     VPValue *Mask = Conds[0];
8103     for (VPValue *V : ArrayRef<VPValue *>(Conds).drop_front())
8104       Mask = Builder.createOr(Mask, V);
8105     if (SrcMask)
8106       Mask = Builder.createLogicalAnd(SrcMask, Mask);
8107     EdgeMaskCache[{Src, Dst}] = Mask;
8108 
8109     // 2. Create the mask for the default destination, which is reached if none
8110     // of the cases with destination != default destination are taken. Join the
8111     // conditions of all cases whose destination is not the default destination
8112     // using an OR and negate it.
8113     DefaultMask = DefaultMask ? Builder.createOr(DefaultMask, Mask) : Mask;
8114   }
8115 
8116   if (DefaultMask) {
8117     DefaultMask = Builder.createNot(DefaultMask);
8118     if (SrcMask)
8119       DefaultMask = Builder.createLogicalAnd(SrcMask, DefaultMask);
8120   }
8121   EdgeMaskCache[{Src, DefaultDst}] = DefaultMask;
8122 }
8123 
8124 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
8125   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
8126 
8127   // Look for cached value.
8128   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
8129   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
8130   if (ECEntryIt != EdgeMaskCache.end())
8131     return ECEntryIt->second;
8132 
8133   if (auto *SI = dyn_cast<SwitchInst>(Src->getTerminator())) {
8134     createSwitchEdgeMasks(SI);
8135     assert(EdgeMaskCache.contains(Edge) && "Mask for Edge not created?");
8136     return EdgeMaskCache[Edge];
8137   }
8138 
8139   VPValue *SrcMask = getBlockInMask(Src);
8140 
8141   // The terminator has to be a branch inst!
8142   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
8143   assert(BI && "Unexpected terminator found");
8144   if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
8145     return EdgeMaskCache[Edge] = SrcMask;
8146 
8147   // If source is an exiting block, we know the exit edge is dynamically dead
8148   // in the vector loop, and thus we don't need to restrict the mask.  Avoid
8149   // adding uses of an otherwise potentially dead instruction unless we are
8150   // vectorizing a loop with uncountable exits. In that case, we always
8151   // materialize the mask.
8152   if (OrigLoop->isLoopExiting(Src) &&
8153       Src != Legal->getUncountableEarlyExitingBlock())
8154     return EdgeMaskCache[Edge] = SrcMask;
8155 
8156   VPValue *EdgeMask = getVPValueOrAddLiveIn(BI->getCondition());
8157   assert(EdgeMask && "No Edge Mask found for condition");
8158 
8159   if (BI->getSuccessor(0) != Dst)
8160     EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc());
8161 
8162   if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
8163     // The bitwise 'And' of SrcMask and EdgeMask introduces new UB if SrcMask
8164     // is false and EdgeMask is poison. Avoid that by using 'LogicalAnd'
8165     // instead which generates 'select i1 SrcMask, i1 EdgeMask, i1 false'.
8166     EdgeMask = Builder.createLogicalAnd(SrcMask, EdgeMask, BI->getDebugLoc());
8167   }
8168 
8169   return EdgeMaskCache[Edge] = EdgeMask;
8170 }
8171 
8172 VPValue *VPRecipeBuilder::getEdgeMask(BasicBlock *Src, BasicBlock *Dst) const {
8173   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
8174 
8175   // Look for cached value.
8176   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
8177   EdgeMaskCacheTy::const_iterator ECEntryIt = EdgeMaskCache.find(Edge);
8178   assert(ECEntryIt != EdgeMaskCache.end() &&
8179          "looking up mask for edge which has not been created");
8180   return ECEntryIt->second;
8181 }
8182 
8183 void VPRecipeBuilder::createHeaderMask() {
8184   BasicBlock *Header = OrigLoop->getHeader();
8185 
8186   // When not folding the tail, use nullptr to model all-true mask.
8187   if (!CM.foldTailByMasking()) {
8188     BlockMaskCache[Header] = nullptr;
8189     return;
8190   }
8191 
8192   // Introduce the early-exit compare IV <= BTC to form header block mask.
8193   // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
8194   // constructing the desired canonical IV in the header block as its first
8195   // non-phi instructions.
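  // For example, for a loop with a trip count of 10 and VF 4, BTC is 9; the
  // third vector iteration compares <8, 9, 10, 11> ule 9 and yields the mask
  // <1, 1, 0, 0>, disabling the two excess lanes.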
8196 
8197   VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
8198   auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
8199   auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV());
8200   HeaderVPBB->insert(IV, NewInsertionPoint);
8201 
8202   VPBuilder::InsertPointGuard Guard(Builder);
8203   Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
8204   VPValue *BlockMask = nullptr;
8205   VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
8206   BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC);
8207   BlockMaskCache[Header] = BlockMask;
8208 }
8209 
8210 VPValue *VPRecipeBuilder::getBlockInMask(BasicBlock *BB) const {
8211   // Return the cached value.
8212   BlockMaskCacheTy::const_iterator BCEntryIt = BlockMaskCache.find(BB);
8213   assert(BCEntryIt != BlockMaskCache.end() &&
8214          "Trying to access mask for block without one.");
8215   return BCEntryIt->second;
8216 }
8217 
8218 void VPRecipeBuilder::createBlockInMask(BasicBlock *BB) {
8219   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
8220   assert(BlockMaskCache.count(BB) == 0 && "Mask for block already computed");
8221   assert(OrigLoop->getHeader() != BB &&
8222          "Loop header must have cached block mask");
8223 
8224   // All-one mask is modelled as no-mask following the convention for masked
8225   // load/store/gather/scatter. Initialize BlockMask to no-mask.
8226   VPValue *BlockMask = nullptr;
8227   // This is the block mask. We OR the masks of all unique incoming edges.
8228   for (auto *Predecessor :
8229        SetVector<BasicBlock *>(pred_begin(BB), pred_end(BB))) {
8230     VPValue *EdgeMask = createEdgeMask(Predecessor, BB);
8231     if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is too.
8232       BlockMaskCache[BB] = EdgeMask;
8233       return;
8234     }
8235 
8236     if (!BlockMask) { // BlockMask has its initialized nullptr value.
8237       BlockMask = EdgeMask;
8238       continue;
8239     }
8240 
8241     BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
8242   }
8243 
8244   BlockMaskCache[BB] = BlockMask;
8245 }
8246 
8247 VPWidenMemoryRecipe *
8248 VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
8249                                   VFRange &Range) {
8250   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
8251          "Must be called with either a load or store");
8252 
8253   auto WillWiden = [&](ElementCount VF) -> bool {
8254     LoopVectorizationCostModel::InstWidening Decision =
8255         CM.getWideningDecision(I, VF);
8256     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
8257            "CM decision should be taken at this point.");
8258     if (Decision == LoopVectorizationCostModel::CM_Interleave)
8259       return true;
8260     if (CM.isScalarAfterVectorization(I, VF) ||
8261         CM.isProfitableToScalarize(I, VF))
8262       return false;
8263     return Decision != LoopVectorizationCostModel::CM_Scalarize;
8264   };
8265 
8266   if (!LoopVectorizationPlanner::getDecisionAndClampRange(WillWiden, Range))
8267     return nullptr;
8268 
8269   VPValue *Mask = nullptr;
8270   if (Legal->isMaskRequired(I))
8271     Mask = getBlockInMask(I->getParent());
8272 
8273   // Determine if the pointer operand of the access is either consecutive or
8274   // reverse consecutive.
8275   LoopVectorizationCostModel::InstWidening Decision =
8276       CM.getWideningDecision(I, Range.Start);
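  // CM_Widen corresponds to a consecutive access with stride +1 and
  // CM_Widen_Reverse to a consecutive access with stride -1.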
8277   bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
8278   bool Consecutive =
8279       Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
8280 
8281   VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
8282   if (Consecutive) {
8283     auto *GEP = dyn_cast<GetElementPtrInst>(
8284         Ptr->getUnderlyingValue()->stripPointerCasts());
8285     VPSingleDefRecipe *VectorPtr;
8286     if (Reverse) {
8287       // When folding the tail, we may compute an address that is not computed
8288       // in the original scalar loop, and it may not be inbounds. Drop the
8289       // inbounds flag in that case.
8290       GEPNoWrapFlags Flags =
8291           (CM.foldTailByMasking() || !GEP || !GEP->isInBounds())
8292               ? GEPNoWrapFlags::none()
8293               : GEPNoWrapFlags::inBounds();
8294       VectorPtr = new VPReverseVectorPointerRecipe(
8295           Ptr, &Plan.getVF(), getLoadStoreType(I), Flags, I->getDebugLoc());
8296     } else {
8297       VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I),
8298                                             GEP ? GEP->getNoWrapFlags()
8299                                                 : GEPNoWrapFlags::none(),
8300                                             I->getDebugLoc());
8301     }
8302     Builder.getInsertBlock()->appendRecipe(VectorPtr);
8303     Ptr = VectorPtr;
8304   }
8305   if (LoadInst *Load = dyn_cast<LoadInst>(I))
8306     return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
8307                                  I->getDebugLoc());
8308 
8309   StoreInst *Store = cast<StoreInst>(I);
8310   return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive,
8311                                 Reverse, I->getDebugLoc());
8312 }
8313 
8314 /// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will also
8315 /// insert a recipe to expand the step for the induction recipe.
8316 static VPWidenIntOrFpInductionRecipe *
8317 createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc,
8318                             VPValue *Start, const InductionDescriptor &IndDesc,
8319                             VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop) {
8320   assert(IndDesc.getStartValue() ==
8321          Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
8322   assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) &&
8323          "step must be loop invariant");
8324 
8325   VPValue *Step =
8326       vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE);
8327   if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) {
8328     return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(),
8329                                              IndDesc, TruncI,
8330                                              TruncI->getDebugLoc());
8331   }
8332   assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
8333   return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(),
8334                                            IndDesc, Phi->getDebugLoc());
8335 }
8336 
8337 VPHeaderPHIRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI(
8338     PHINode *Phi, ArrayRef<VPValue *> Operands, VFRange &Range) {
8339 
8340   // Check if this is an integer or fp induction. If so, build the recipe that
8341   // produces its scalar and vector values.
8342   if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
8343     return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, Plan,
8344                                        *PSE.getSE(), *OrigLoop);
8345 
8346   // Check if this is pointer induction. If so, build the recipe for it.
8347   if (auto *II = Legal->getPointerInductionDescriptor(Phi)) {
8348     VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep(),
8349                                                            *PSE.getSE());
8350     return new VPWidenPointerInductionRecipe(
8351         Phi, Operands[0], Step, *II,
8352         LoopVectorizationPlanner::getDecisionAndClampRange(
8353             [&](ElementCount VF) {
8354               return CM.isScalarAfterVectorization(Phi, VF);
8355             },
8356             Range),
8357         Phi->getDebugLoc());
8358   }
8359   return nullptr;
8360 }
8361 
8362 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
8363     TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range) {
8364   // Optimize the special case where the source is a constant integer
8365   // induction variable. Notice that we can only optimize the 'trunc' case
8366   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
8367   // (c) other casts depend on pointer size.
8368 
8369   // Determine whether \p K is a truncation based on an induction variable that
8370   // can be optimized.
8371   auto IsOptimizableIVTruncate =
8372       [&](Instruction *K) -> std::function<bool(ElementCount)> {
8373     return [=](ElementCount VF) -> bool {
8374       return CM.isOptimizableIVTruncate(K, VF);
8375     };
8376   };
8377 
8378   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8379           IsOptimizableIVTruncate(I), Range)) {
8380 
8381     auto *Phi = cast<PHINode>(I->getOperand(0));
8382     const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi);
8383     VPValue *Start = Plan.getOrAddLiveIn(II.getStartValue());
8384     return createWidenInductionRecipes(Phi, I, Start, II, Plan, *PSE.getSE(),
8385                                        *OrigLoop);
8386   }
8387   return nullptr;
8388 }
8389 
8390 VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi,
8391                                            ArrayRef<VPValue *> Operands) {
8392   unsigned NumIncoming = Phi->getNumIncomingValues();
8393 
8394   // We know that all PHIs in non-header blocks are converted into selects, so
8395   // we don't have to worry about the insertion order and we can just use the
8396   // builder. At this point we generate the predication tree. There may be
8397   // duplications since this is a simple recursive scan, but future
8398   // optimizations will clean it up.
8399   SmallVector<VPValue *, 2> OperandsWithMask;
8400 
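  // Operands are collected as (incoming value, edge mask) pairs. If the mask
  // of the first incoming edge is all-true (nullptr), all incoming values are
  // identical and that single value is used without a mask.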
8401   for (unsigned In = 0; In < NumIncoming; In++) {
8402     OperandsWithMask.push_back(Operands[In]);
8403     VPValue *EdgeMask =
8404         getEdgeMask(Phi->getIncomingBlock(In), Phi->getParent());
8405     if (!EdgeMask) {
8406       assert(In == 0 && "Both null and non-null edge masks found");
8407       assert(all_equal(Operands) &&
8408              "Distinct incoming values with one having a full mask");
8409       break;
8410     }
8411     OperandsWithMask.push_back(EdgeMask);
8412   }
8413   return new VPBlendRecipe(Phi, OperandsWithMask);
8414 }
8415 
8416 VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
8417                                                    ArrayRef<VPValue *> Operands,
8418                                                    VFRange &Range) {
8419   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8420       [this, CI](ElementCount VF) {
8421         return CM.isScalarWithPredication(CI, VF);
8422       },
8423       Range);
8424 
8425   if (IsPredicated)
8426     return nullptr;
8427 
8428   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8429   if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8430              ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8431              ID == Intrinsic::pseudoprobe ||
8432              ID == Intrinsic::experimental_noalias_scope_decl))
8433     return nullptr;
8434 
8435   SmallVector<VPValue *, 4> Ops(Operands.take_front(CI->arg_size()));
8436 
8437   // Is it beneficial to perform an intrinsic call rather than a lib call?
8438   bool ShouldUseVectorIntrinsic =
8439       ID && LoopVectorizationPlanner::getDecisionAndClampRange(
8440                 [&](ElementCount VF) -> bool {
8441                   return CM.getCallWideningDecision(CI, VF).Kind ==
8442                          LoopVectorizationCostModel::CM_IntrinsicCall;
8443                 },
8444                 Range);
8445   if (ShouldUseVectorIntrinsic)
8446     return new VPWidenIntrinsicRecipe(*CI, ID, Ops, CI->getType(),
8447                                       CI->getDebugLoc());
8448 
8449   Function *Variant = nullptr;
8450   std::optional<unsigned> MaskPos;
8451   // Is it better to call a vectorized version of the function than to
8452   // scalarize the call?
8453   auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange(
8454       [&](ElementCount VF) -> bool {
8455         // The following case may be scalarized depending on the VF.
8456         // The flag shows whether we can use a usual Call for the
8457         // vectorized version of the instruction.
8458 
8459         // If we've found a variant at a previous VF, then stop looking. A
8460         // vectorized variant of a function expects input in a certain shape
8461         // -- basically the number of input registers, the number of lanes
8462         // per register, and whether there's a mask required.
8463         // We store a pointer to the variant in the VPWidenCallRecipe, so
8464         // once we have an appropriate variant it's only valid for that VF.
8465         // This will force a different VPlan to be generated for each VF that
8466         // finds a valid variant.
8467         if (Variant)
8468           return false;
8469         LoopVectorizationCostModel::CallWideningDecision Decision =
8470             CM.getCallWideningDecision(CI, VF);
8471         if (Decision.Kind == LoopVectorizationCostModel::CM_VectorCall) {
8472           Variant = Decision.Variant;
8473           MaskPos = Decision.MaskPos;
8474           return true;
8475         }
8476 
8477         return false;
8478       },
8479       Range);
8480   if (ShouldUseVectorCall) {
8481     if (MaskPos.has_value()) {
8482       // We have 2 cases that would require a mask:
8483       //   1) The block needs to be predicated, either due to a conditional
8484       //      in the scalar loop or use of an active lane mask with
8485       //      tail-folding, and we use the appropriate mask for the block.
8486       //   2) No mask is required for the block, but the only available
8487       //      vector variant at this VF requires a mask, so we synthesize an
8488       //      all-true mask.
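           //
           // E.g. (hypothetical variant, for illustration only): given a
           // masked vector variant such as
           //   declare <4 x float> @foo_masked(<4 x float>, <4 x i1>)
           // called from an unpredicated block, case 2 applies and the
           // <4 x i1> operand becomes an all-true constant.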
8489       VPValue *Mask = nullptr;
8490       if (Legal->isMaskRequired(CI))
8491         Mask = getBlockInMask(CI->getParent());
8492       else
8493         Mask = Plan.getOrAddLiveIn(
8494             ConstantInt::getTrue(IntegerType::getInt1Ty(CI->getContext())));
8495 
8496       Ops.insert(Ops.begin() + *MaskPos, Mask);
8497     }
8498 
8499     Ops.push_back(Operands.back());
8500     return new VPWidenCallRecipe(CI, Variant, Ops, CI->getDebugLoc());
8501   }
8502 
8503   return nullptr;
8504 }
8505 
8506 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
8507   assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8508          !isa<StoreInst>(I) && "Instruction should have been handled earlier");
8509   // Instruction should be widened, unless it is scalar after vectorization,
8510   // scalarization is profitable or it is predicated.
8511   auto WillScalarize = [this, I](ElementCount VF) -> bool {
8512     return CM.isScalarAfterVectorization(I, VF) ||
8513            CM.isProfitableToScalarize(I, VF) ||
8514            CM.isScalarWithPredication(I, VF);
8515   };
8516   return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
8517                                                              Range);
8518 }
8519 
8520 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
8521                                            ArrayRef<VPValue *> Operands,
8522                                            VPBasicBlock *VPBB) {
8523   switch (I->getOpcode()) {
8524   default:
8525     return nullptr;
8526   case Instruction::SDiv:
8527   case Instruction::UDiv:
8528   case Instruction::SRem:
8529   case Instruction::URem: {
8530     // If not provably safe, use a select to form a safe divisor before widening the
8531     // div/rem operation itself.  Otherwise fall through to general handling below.
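         //
         // E.g. (sketch): a conditional "x / y" in the scalar loop becomes,
         // roughly,
         //   %safe.y = select <block-in mask>, %y, 1
         //   %div    = udiv %x, %safe.y
         // so that masked-off lanes divide by 1 rather than by a potentially
         // zero %y.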
8532     if (CM.isPredicatedInst(I)) {
8533       SmallVector<VPValue *> Ops(Operands);
8534       VPValue *Mask = getBlockInMask(I->getParent());
8535       VPValue *One =
8536           Plan.getOrAddLiveIn(ConstantInt::get(I->getType(), 1u, false));
8537       auto *SafeRHS = Builder.createSelect(Mask, Ops[1], One, I->getDebugLoc());
8538       Ops[1] = SafeRHS;
8539       return new VPWidenRecipe(*I, make_range(Ops.begin(), Ops.end()));
8540     }
8541     [[fallthrough]];
8542   }
8543   case Instruction::Add:
8544   case Instruction::And:
8545   case Instruction::AShr:
8546   case Instruction::FAdd:
8547   case Instruction::FCmp:
8548   case Instruction::FDiv:
8549   case Instruction::FMul:
8550   case Instruction::FNeg:
8551   case Instruction::FRem:
8552   case Instruction::FSub:
8553   case Instruction::ICmp:
8554   case Instruction::LShr:
8555   case Instruction::Mul:
8556   case Instruction::Or:
8557   case Instruction::Select:
8558   case Instruction::Shl:
8559   case Instruction::Sub:
8560   case Instruction::Xor:
8561   case Instruction::Freeze:
8562     SmallVector<VPValue *> NewOps(Operands);
8563     if (Instruction::isBinaryOp(I->getOpcode())) {
8564       // The legacy cost model uses SCEV to check if some of the operands are
8565       // constants. To match the legacy cost model's behavior, use SCEV to try
8566       // to replace operands with constants.
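           //
           // E.g. (illustrative): for "x * n", if SCEV proves under the
           // loop's predicates that n is the constant 4, the VPValue for n is
           // replaced by the live-in constant 4, so the VPlan-based cost
           // matches the legacy one.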
8567       ScalarEvolution &SE = *PSE.getSE();
8568       auto GetConstantViaSCEV = [this, &SE](VPValue *Op) {
8569         Value *V = Op->getUnderlyingValue();
8570         if (isa<Constant>(V) || !SE.isSCEVable(V->getType()))
8571           return Op;
8572         auto *C = dyn_cast<SCEVConstant>(SE.getSCEV(V));
8573         if (!C)
8574           return Op;
8575         return Plan.getOrAddLiveIn(C->getValue());
8576       };
8577       // For Mul, the legacy cost model checks both operands.
8578       if (I->getOpcode() == Instruction::Mul)
8579         NewOps[0] = GetConstantViaSCEV(NewOps[0]);
8580       // For other binops, the legacy cost model only checks the second operand.
8581       NewOps[1] = GetConstantViaSCEV(NewOps[1]);
8582     }
8583     return new VPWidenRecipe(*I, make_range(NewOps.begin(), NewOps.end()));
8584   };
8585 }
8586 
8587 VPHistogramRecipe *
8588 VPRecipeBuilder::tryToWidenHistogram(const HistogramInfo *HI,
8589                                      ArrayRef<VPValue *> Operands) {
8590   // FIXME: Support other operations.
8591   unsigned Opcode = HI->Update->getOpcode();
8592   assert((Opcode == Instruction::Add || Opcode == Instruction::Sub) &&
8593          "Histogram update operation must be an Add or Sub");
8594 
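       // The pattern being widened is, roughly (illustrative sketch):
       //   for (i = 0; i < n; i++)
       //     buckets[indices[i]] += inc;
       // where Operands[1] is the address of buckets[indices[i]] and the
       // increment is the second operand of HI->Update.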
8595   SmallVector<VPValue *, 3> HGramOps;
8596   // Bucket address.
8597   HGramOps.push_back(Operands[1]);
8598   // Increment value.
8599   HGramOps.push_back(getVPValueOrAddLiveIn(HI->Update->getOperand(1)));
8600 
8601   // In case of predicated execution (due to tail-folding, or conditional
8602   // execution, or both), pass the relevant mask.
8603   if (Legal->isMaskRequired(HI->Store))
8604     HGramOps.push_back(getBlockInMask(HI->Store->getParent()));
8605 
8606   return new VPHistogramRecipe(Opcode,
8607                                make_range(HGramOps.begin(), HGramOps.end()),
8608                                HI->Store->getDebugLoc());
8609 }
8610 
8611 void VPRecipeBuilder::fixHeaderPhis() {
8612   BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
8613   for (VPHeaderPHIRecipe *R : PhisToFix) {
8614     auto *PN = cast<PHINode>(R->getUnderlyingValue());
8615     VPRecipeBase *IncR =
8616         getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
8617     R->addOperand(IncR->getVPSingleValue());
8618   }
8619 }
8620 
8621 VPReplicateRecipe *VPRecipeBuilder::handleReplication(Instruction *I,
8622                                                       VFRange &Range) {
8623   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
8624       [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8625       Range);
8626 
8627   bool IsPredicated = CM.isPredicatedInst(I);
8628 
8629   // Even if the instruction is not marked as uniform, there are certain
8630   // intrinsic calls that can be effectively treated as such, so we check for
8631   // them here. Conservatively, we only do this for scalable vectors, since
8632   // for fixed-width VFs we can always fall back on full scalarization.
8633   if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
8634     switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
8635     case Intrinsic::assume:
8636     case Intrinsic::lifetime_start:
8637     case Intrinsic::lifetime_end:
8638       // For scalable vectors, if one of the operands is variant then we still
8639       // want to mark the call as uniform, which will generate one instruction for
8640       // the first lane of the vector. We can't scalarize the call in the same
8641       // way as for fixed-width vectors because we don't know how many lanes
8642       // there are.
8643       //
8644       // The reasons for doing it this way for scalable vectors are:
8645       //   1. For the assume intrinsic, generating the instruction for the
8646       //      first lane is still better than not generating any at all. For
8647       //      example, the input may be a splat across all lanes.
8648       //   2. For the lifetime start/end intrinsics the pointer operand only
8649       //      does anything useful when the input comes from a stack object,
8650       //      which suggests it should always be uniform. For non-stack objects
8651       //      the effect is to poison the object, which still allows us to
8652       //      remove the call.
8653       IsUniform = true;
8654       break;
8655     default:
8656       break;
8657     }
8658   }
8659   VPValue *BlockInMask = nullptr;
8660   if (!IsPredicated) {
8661     // Not predicated: the replicate recipe for Instr needs no mask operand.
8662     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8663   } else {
8664     LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8665     // Instructions marked for predication are replicated and a mask operand is
8666     // added initially. Masked replicate recipes will later be placed under an
8667     // if-then construct to prevent side-effects. Generate recipes to compute
8668     // the block mask for this region.
8669     BlockInMask = getBlockInMask(I->getParent());
8670   }
8671 
8672   // Note that there is some custom logic to mark some intrinsics as uniform
8673   // manually above for scalable vectors, which this assert needs to account for
8674   // as well.
8675   assert((Range.Start.isScalar() || !IsUniform || !IsPredicated ||
8676           (Range.Start.isScalable() && isa<IntrinsicInst>(I))) &&
8677          "Should not predicate a uniform recipe");
8678   auto *Recipe = new VPReplicateRecipe(I, mapToVPValues(I->operands()),
8679                                        IsUniform, BlockInMask);
8680   return Recipe;
8681 }
8682 
8683 /// Find all possible partial reductions in the loop and track all of those that
8684 /// are valid so recipes can be formed later.
8685 void VPRecipeBuilder::collectScaledReductions(VFRange &Range) {
8686   // Find all possible partial reductions.
8687   SmallVector<std::pair<PartialReductionChain, unsigned>>
8688       PartialReductionChains;
8689   for (const auto &[Phi, RdxDesc] : Legal->getReductionVars()) {
8690     getScaledReductions(Phi, RdxDesc.getLoopExitInstr(), Range,
8691                         PartialReductionChains);
8692   }
8693 
8694   // A partial reduction is invalid if any of its extends are used by
8695   // something that isn't another partial reduction. This is because the
8696   // extends are intended to be lowered along with the reduction itself.
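       //
       // E.g. (sketch):
       //   %ext = sext i8 %a to i32
       //   %mul = mul i32 %ext, %ext.b   ; feeds a partial reduction
       //   %use = add i32 %ext, 1        ; non-partial-reduction user
       // Here the chain is rejected, because %ext must stay widened for %use.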
8697 
8698   // Build up a set of partial reduction bin ops for efficient use checking.
8699   SmallSet<User *, 4> PartialReductionBinOps;
8700   for (const auto &[PartialRdx, _] : PartialReductionChains)
8701     PartialReductionBinOps.insert(PartialRdx.BinOp);
8702 
8703   auto ExtendIsOnlyUsedByPartialReductions =
8704       [&PartialReductionBinOps](Instruction *Extend) {
8705         return all_of(Extend->users(), [&](const User *U) {
8706           return PartialReductionBinOps.contains(U);
8707         });
8708       };
8709 
8710   // Check if each use of a chain's two extends is a partial reduction
8711   // and only add those that don't have non-partial reduction users.
8712   for (auto Pair : PartialReductionChains) {
8713     PartialReductionChain Chain = Pair.first;
8714     if (ExtendIsOnlyUsedByPartialReductions(Chain.ExtendA) &&
8715         ExtendIsOnlyUsedByPartialReductions(Chain.ExtendB))
8716       ScaledReductionMap.insert(std::make_pair(Chain.Reduction, Pair.second));
8717   }
8718 }
8719 
8720 bool VPRecipeBuilder::getScaledReductions(
8721     Instruction *PHI, Instruction *RdxExitInstr, VFRange &Range,
8722     SmallVectorImpl<std::pair<PartialReductionChain, unsigned>> &Chains) {
8723 
8724   if (!CM.TheLoop->contains(RdxExitInstr))
8725     return false;
8726 
8727   // TODO: Allow scaling reductions when predicating. The select at
8728   // the end of the loop chooses between the phi value and most recent
8729   // reduction result, both of which have different VFs to the active lane
8730   // mask when scaling.
8731   if (CM.blockNeedsPredicationForAnyReason(RdxExitInstr->getParent()))
8732     return false;
8733 
8734   auto *Update = dyn_cast<BinaryOperator>(RdxExitInstr);
8735   if (!Update)
8736     return false;
8737 
8738   Value *Op = Update->getOperand(0);
8739   Value *PhiOp = Update->getOperand(1);
8740   if (Op == PHI)
8741     std::swap(Op, PhiOp);
8742 
8743   // Try to get a scaled reduction from the first non-phi operand.
8744   // If one is found, we use the discovered reduction instruction in
8745   // place of the accumulator for costing.
8746   if (auto *OpInst = dyn_cast<Instruction>(Op)) {
8747     if (getScaledReductions(PHI, OpInst, Range, Chains)) {
8748       PHI = Chains.rbegin()->first.Reduction;
8749 
8750       Op = Update->getOperand(0);
8751       PhiOp = Update->getOperand(1);
8752       if (Op == PHI)
8753         std::swap(Op, PhiOp);
8754     }
8755   }
8756   if (PhiOp != PHI)
8757     return false;
8758 
8759   auto *BinOp = dyn_cast<BinaryOperator>(Op);
8760   if (!BinOp || !BinOp->hasOneUse())
8761     return false;
8762 
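       // The pattern matched below is a dot-product style chain, roughly
       // (sketch):
       //   %a.ext = sext i8 %a to i32
       //   %b.ext = sext i8 %b to i32
       //   %mul   = mul i32 %a.ext, %b.ext
       //   %rdx   = add i32 %phi, %mul
       // where the accumulator type (i32) is wider than the extended inputs
       // (i8), giving a scale factor of 4.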
8763   using namespace llvm::PatternMatch;
8764   Value *A, *B;
8765   if (!match(BinOp->getOperand(0), m_ZExtOrSExt(m_Value(A))) ||
8766       !match(BinOp->getOperand(1), m_ZExtOrSExt(m_Value(B))))
8767     return false;
8768 
8769   Instruction *ExtA = cast<Instruction>(BinOp->getOperand(0));
8770   Instruction *ExtB = cast<Instruction>(BinOp->getOperand(1));
8771 
8772   TTI::PartialReductionExtendKind OpAExtend =
8773       TargetTransformInfo::getPartialReductionExtendKind(ExtA);
8774   TTI::PartialReductionExtendKind OpBExtend =
8775       TargetTransformInfo::getPartialReductionExtendKind(ExtB);
8776 
8777   PartialReductionChain Chain(RdxExitInstr, ExtA, ExtB, BinOp);
8778 
8779   unsigned TargetScaleFactor =
8780       PHI->getType()->getPrimitiveSizeInBits().getKnownScalarFactor(
8781           A->getType()->getPrimitiveSizeInBits());
8782 
8783   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8784           [&](ElementCount VF) {
8785             InstructionCost Cost = TTI->getPartialReductionCost(
8786                 Update->getOpcode(), A->getType(), B->getType(), PHI->getType(),
8787                 VF, OpAExtend, OpBExtend,
8788                 std::make_optional(BinOp->getOpcode()));
8789             return Cost.isValid();
8790           },
8791           Range)) {
8792     Chains.push_back(std::make_pair(Chain, TargetScaleFactor));
8793     return true;
8794   }
8795 
8796   return false;
8797 }
8798 
8799 VPRecipeBase *
8800 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8801                                         ArrayRef<VPValue *> Operands,
8802                                         VFRange &Range, VPBasicBlock *VPBB) {
8803   // First, check for specific widening recipes that deal with inductions, Phi
8804   // nodes, calls and memory operations.
8805   VPRecipeBase *Recipe;
8806   if (auto *Phi = dyn_cast<PHINode>(Instr)) {
8807     if (Phi->getParent() != OrigLoop->getHeader())
8808       return tryToBlend(Phi, Operands);
8809 
8810     if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range)))
8811       return Recipe;
8812 
8813     VPHeaderPHIRecipe *PhiRecipe = nullptr;
8814     assert((Legal->isReductionVariable(Phi) ||
8815             Legal->isFixedOrderRecurrence(Phi)) &&
8816            "can only widen reductions and fixed-order recurrences here");
8817     VPValue *StartV = Operands[0];
8818     if (Legal->isReductionVariable(Phi)) {
8819       const RecurrenceDescriptor &RdxDesc =
8820           Legal->getReductionVars().find(Phi)->second;
8821       assert(RdxDesc.getRecurrenceStartValue() ==
8822              Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8823 
8824       // If the PHI is used by a partial reduction, set the scale factor.
8825       unsigned ScaleFactor =
8826           getScalingForReduction(RdxDesc.getLoopExitInstr()).value_or(1);
8827       PhiRecipe = new VPReductionPHIRecipe(
8828           Phi, RdxDesc, *StartV, CM.isInLoopReduction(Phi),
8829           CM.useOrderedReductions(RdxDesc), ScaleFactor);
8830     } else {
8831       // TODO: Currently fixed-order recurrences are modeled as chains of
8832       // first-order recurrences. If there are no users of the intermediate
8833       // recurrences in the chain, the fixed order recurrence should be modeled
8834       // directly, enabling more efficient codegen.
8835       PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
8836     }
8837 
8838     PhisToFix.push_back(PhiRecipe);
8839     return PhiRecipe;
8840   }
8841 
8842   if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate(
8843                                     cast<TruncInst>(Instr), Operands, Range)))
8844     return Recipe;
8845 
8846   // All widen recipes below deal only with VF > 1.
8847   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8848           [&](ElementCount VF) { return VF.isScalar(); }, Range))
8849     return nullptr;
8850 
8851   if (auto *CI = dyn_cast<CallInst>(Instr))
8852     return tryToWidenCall(CI, Operands, Range);
8853 
8854   if (StoreInst *SI = dyn_cast<StoreInst>(Instr))
8855     if (auto HistInfo = Legal->getHistogramInfo(SI))
8856       return tryToWidenHistogram(*HistInfo, Operands);
8857 
8858   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8859     return tryToWidenMemory(Instr, Operands, Range);
8860 
8861   if (getScalingForReduction(Instr))
8862     return tryToCreatePartialReduction(Instr, Operands);
8863 
8864   if (!shouldWiden(Instr, Range))
8865     return nullptr;
8866 
8867   if (auto *GEP = dyn_cast<GetElementPtrInst>(Instr))
8868     return new VPWidenGEPRecipe(GEP,
8869                                 make_range(Operands.begin(), Operands.end()));
8870 
8871   if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8872     return new VPWidenSelectRecipe(
8873         *SI, make_range(Operands.begin(), Operands.end()));
8874   }
8875 
8876   if (auto *CI = dyn_cast<CastInst>(Instr)) {
8877     return new VPWidenCastRecipe(CI->getOpcode(), Operands[0], CI->getType(),
8878                                  *CI);
8879   }
8880 
8881   return tryToWiden(Instr, Operands, VPBB);
8882 }
8883 
8884 VPRecipeBase *
8885 VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
8886                                              ArrayRef<VPValue *> Operands) {
8887   assert(Operands.size() == 2 &&
8888          "Unexpected number of operands for partial reduction");
8889 
8890   VPValue *BinOp = Operands[0];
8891   VPValue *Accumulator = Operands[1];
8892   VPRecipeBase *BinOpRecipe = BinOp->getDefiningRecipe();
8893   if (isa<VPReductionPHIRecipe>(BinOpRecipe) ||
8894       isa<VPPartialReductionRecipe>(BinOpRecipe))
8895     std::swap(BinOp, Accumulator);
8896 
8897   return new VPPartialReductionRecipe(Reduction->getOpcode(), BinOp,
8898                                       Accumulator, Reduction);
8899 }
8900 
8901 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8902                                                         ElementCount MaxVF) {
8903   assert(OrigLoop->isInnermost() && "Inner loop expected.");
8904 
8905   auto MaxVFTimes2 = MaxVF * 2;
8906   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
8907     VFRange SubRange = {VF, MaxVFTimes2};
8908     if (auto Plan = tryToBuildVPlanWithVPRecipes(SubRange)) {
8909       // Now optimize the initial VPlan.
8910       if (!Plan->hasVF(ElementCount::getFixed(1)))
8911         VPlanTransforms::truncateToMinimalBitwidths(*Plan,
8912                                                     CM.getMinimalBitwidths());
8913       VPlanTransforms::optimize(*Plan);
8914       // TODO: try to put it close to addActiveLaneMask().
8915       // Discard the plan if it is not EVL-compatible
8916       if (CM.foldTailWithEVL() && !VPlanTransforms::tryAddExplicitVectorLength(
8917                                       *Plan, CM.getMaxSafeElements()))
8918         break;
8919       assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
8920       VPlans.push_back(std::move(Plan));
8921     }
8922     VF = SubRange.End;
8923   }
8924 }
8925 
8926 // Add the necessary canonical IV and branch recipes required to control the
8927 // loop.
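     //
     // The resulting loop control is, roughly (sketch):
     //   index      = phi [ 0, vector.ph ], [ index.next, vector.latch ]
     //   index.next = add [nuw] index, VF * UF
     //   branch-on-count index.next, vector.trip.count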
8928 static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
8929                                   DebugLoc DL) {
8930   Value *StartIdx = ConstantInt::get(IdxTy, 0);
8931   auto *StartV = Plan.getOrAddLiveIn(StartIdx);
8932 
8933   // Add a VPCanonicalIVPHIRecipe starting at 0 to the header.
8934   auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
8935   VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
8936   VPBasicBlock *Header = TopRegion->getEntryBasicBlock();
8937   Header->insert(CanonicalIVPHI, Header->begin());
8938 
8939   VPBuilder Builder(TopRegion->getExitingBasicBlock());
8940   // Add a VPInstruction to increment the scalar canonical IV by VF * UF.
8941   auto *CanonicalIVIncrement = Builder.createOverflowingOp(
8942       Instruction::Add, {CanonicalIVPHI, &Plan.getVFxUF()}, {HasNUW, false}, DL,
8943       "index.next");
8944   CanonicalIVPHI->addOperand(CanonicalIVIncrement);
8945 
8946   // Add the BranchOnCount VPInstruction to the latch.
8947   Builder.createNaryOp(VPInstruction::BranchOnCount,
8948                        {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
8949 }
8950 
8951 /// Create and return a ResumePhi for \p WideIV, unless it is truncated. If the
8952 /// induction recipe is not canonical, creates a VPDerivedIVRecipe to compute
8953 /// the end value of the induction.
8954 static VPInstruction *addResumePhiRecipeForInduction(
8955     VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder,
8956     VPBuilder &ScalarPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC) {
8957   auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
8958   // Truncated wide inductions resume from the last lane of their vector value
8959 // in the last vector iteration, which is handled elsewhere.
8960   if (WideIntOrFp && WideIntOrFp->getTruncInst())
8961     return nullptr;
8962 
8963   VPValue *Start = WideIV->getStartValue();
8964   VPValue *Step = WideIV->getStepValue();
8965   const InductionDescriptor &ID = WideIV->getInductionDescriptor();
8966   VPValue *EndValue = VectorTC;
8967   if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
8968     EndValue = VectorPHBuilder.createDerivedIV(
8969         ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
8970         Start, VectorTC, Step);
8971   }
8972 
8973   // EndValue is derived from the vector trip count (which has the same type as
8974   // the widest induction) and thus may be wider than the induction here.
8975   Type *ScalarTypeOfWideIV = TypeInfo.inferScalarType(WideIV);
8976   if (ScalarTypeOfWideIV != TypeInfo.inferScalarType(EndValue)) {
8977     EndValue = VectorPHBuilder.createScalarCast(Instruction::Trunc, EndValue,
8978                                                 ScalarTypeOfWideIV,
8979                                                 WideIV->getDebugLoc());
8980   }
8981 
8982   auto *ResumePhiRecipe =
8983       ScalarPHBuilder.createNaryOp(VPInstruction::ResumePhi, {EndValue, Start},
8984                                    WideIV->getDebugLoc(), "bc.resume.val");
8985   return ResumePhiRecipe;
8986 }
8987 
8988 /// Create resume phis in the scalar preheader for first-order recurrences,
8989 /// reductions and inductions, and update the VPIRInstructions wrapping the
8990 /// original phis in the scalar header. End values for inductions are added to
8991 /// \p IVEndValues.
8992 static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan,
8993                                 DenseMap<VPValue *, VPValue *> &IVEndValues) {
8994   VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
8995   auto *ScalarPH = Plan.getScalarPreheader();
8996   auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getSinglePredecessor());
8997   VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
8998   VPBuilder VectorPHBuilder(
8999       cast<VPBasicBlock>(VectorRegion->getSinglePredecessor()));
9000   VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
9001   VPBuilder ScalarPHBuilder(ScalarPH);
9002   VPValue *OneVPV = Plan.getOrAddLiveIn(
9003       ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1));
9004   for (VPRecipeBase &ScalarPhiR : *Plan.getScalarHeader()) {
9005     auto *ScalarPhiIRI = cast<VPIRInstruction>(&ScalarPhiR);
9006     auto *ScalarPhiI = dyn_cast<PHINode>(&ScalarPhiIRI->getInstruction());
9007     if (!ScalarPhiI)
9008       break;
9009 
9010     // TODO: Extract final value from induction recipe initially, optimize to
9011     // pre-computed end value together in optimizeInductionExitUsers.
9012     auto *VectorPhiR = cast<VPHeaderPHIRecipe>(Builder.getRecipe(ScalarPhiI));
9013     if (auto *WideIVR = dyn_cast<VPWidenInductionRecipe>(VectorPhiR)) {
9014       if (VPInstruction *ResumePhi = addResumePhiRecipeForInduction(
9015               WideIVR, VectorPHBuilder, ScalarPHBuilder, TypeInfo,
9016               &Plan.getVectorTripCount())) {
9017         assert(ResumePhi->getOpcode() == VPInstruction::ResumePhi &&
9018                "Expected a ResumePhi");
9019         IVEndValues[WideIVR] = ResumePhi->getOperand(0);
9020         ScalarPhiIRI->addOperand(ResumePhi);
9021         continue;
9022       }
9023       // TODO: Also handle truncated inductions here. Computing end-values
9024       // separately should be done as VPlan-to-VPlan optimization, after
9025       // legalizing all resume values to use the last lane from the loop.
9026       assert(cast<VPWidenIntOrFpInductionRecipe>(VectorPhiR)->getTruncInst() &&
9027              "should only skip truncated wide inductions");
9028       continue;
9029     }
9030 
9031     // The backedge value provides the value to resume coming out of a loop,
9032     // which for FORs is a vector whose last element needs to be extracted. The
9033     // start value provides the value if the loop is bypassed.
9034     bool IsFOR = isa<VPFirstOrderRecurrencePHIRecipe>(VectorPhiR);
9035     auto *ResumeFromVectorLoop = VectorPhiR->getBackedgeValue();
9036     assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
9037            "Cannot handle loops with uncountable early exits");
9038     if (IsFOR)
9039       ResumeFromVectorLoop = MiddleBuilder.createNaryOp(
9040           VPInstruction::ExtractFromEnd, {ResumeFromVectorLoop, OneVPV}, {},
9041           "vector.recur.extract");
9042     StringRef Name = IsFOR ? "scalar.recur.init" : "bc.merge.rdx";
9043     auto *ResumePhiR = ScalarPHBuilder.createNaryOp(
9044         VPInstruction::ResumePhi,
9045         {ResumeFromVectorLoop, VectorPhiR->getStartValue()}, {}, Name);
9046     ScalarPhiIRI->addOperand(ResumePhiR);
9047   }
9048 }
9049 
9050 // Collect VPIRInstructions for phis in the exit blocks that are modeled
9051 // in VPlan and add the exiting VPValue as operand.
9052 static SetVector<VPIRInstruction *>
9053 collectUsersInExitBlocks(Loop *OrigLoop, VPRecipeBuilder &Builder,
9054                          VPlan &Plan) {
9055   SetVector<VPIRInstruction *> ExitUsersToFix;
9056   for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
9057     for (VPRecipeBase &R : *ExitVPBB) {
9058       auto *ExitIRI = dyn_cast<VPIRInstruction>(&R);
9059       if (!ExitIRI)
9060         continue;
9061       auto *ExitPhi = dyn_cast<PHINode>(&ExitIRI->getInstruction());
9062       if (!ExitPhi)
9063         break;
9064       if (ExitVPBB->getSinglePredecessor() != Plan.getMiddleBlock()) {
9065         assert(ExitIRI->getNumOperands() ==
9066                    ExitVPBB->getPredecessors().size() &&
9067                "early-exit must update exit values on construction");
9068         continue;
9069       }
9070       BasicBlock *ExitingBB = OrigLoop->getLoopLatch();
9071       Value *IncomingValue = ExitPhi->getIncomingValueForBlock(ExitingBB);
9072       VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue);
9073       ExitIRI->addOperand(V);
9074       if (V->isLiveIn())
9075         continue;
9076       assert(V->getDefiningRecipe()->getParent()->getEnclosingLoopRegion() &&
9077              "Only recipes defined inside a region should need fixing.");
9078       ExitUsersToFix.insert(ExitIRI);
9079     }
9080   }
9081   return ExitUsersToFix;
9082 }
9083 
9084 // Add exit values to \p Plan. Extracts are added for each entry in \p
9085 // ExitUsersToFix if needed and their operands are updated.
9086 static void
9087 addUsersInExitBlocks(VPlan &Plan,
9088                      const SetVector<VPIRInstruction *> &ExitUsersToFix) {
9089   if (ExitUsersToFix.empty())
9090     return;
9091 
9092   auto *MiddleVPBB = Plan.getMiddleBlock();
9093   VPBuilder B(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
9094 
9095   // Introduce extract for exiting values and update the VPIRInstructions
9096   // modeling the corresponding LCSSA phis.
9097   for (VPIRInstruction *ExitIRI : ExitUsersToFix) {
9098     assert(ExitIRI->getNumOperands() == 1 &&
9099            ExitIRI->getParent()->getSinglePredecessor() == MiddleVPBB &&
9100            "exit values from early exits must be fixed when branch to "
9101            "early-exit is added");
9102     ExitIRI->extractLastLaneOfOperand(B);
9103   }
9104 }
9105 
9106 /// Handle users in the original exit block for first-order recurrences. The
9107 /// penultimate value of each recurrence is fed to its LCSSA phi users in the
9108 /// original exit block via the VPIRInstruction wrapping the corresponding
9109 /// LCSSA phi.
9110 static void addExitUsersForFirstOrderRecurrences(
9111     VPlan &Plan, SetVector<VPIRInstruction *> &ExitUsersToFix) {
9112   VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
9113   auto *ScalarPHVPBB = Plan.getScalarPreheader();
9114   auto *MiddleVPBB = Plan.getMiddleBlock();
9115   VPBuilder ScalarPHBuilder(ScalarPHVPBB);
9116   VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
9117   VPValue *TwoVPV = Plan.getOrAddLiveIn(
9118       ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 2));
9119 
9120   for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
9121     auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
9122     if (!FOR)
9123       continue;
9124 
9125     assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
9126            "Cannot handle loops with uncountable early exits");
9127 
9128     // This is the second phase of vectorizing first-order recurrences,
9129     // creating extracts for users outside the loop. An overview of the
9130     // transformation is described below. Suppose we have the following loop,
9131     // with a use of the last a[i-1] after the loop:
9132     //
9133     //   for (int i = 0; i < n; ++i) {
9134     //     t = a[i - 1];
9135     //     b[i] = a[i] - t;
9136     //   }
9137     //   use t;
9138     //
9139     // There is a first-order recurrence on "a". For this loop, the shorthand
9140     // scalar IR looks like:
9141     //
9142     //   scalar.ph:
9143     //     s.init = a[-1]
9144     //     br scalar.body
9145     //
9146     //   scalar.body:
9147     //     i = phi [0, scalar.ph], [i+1, scalar.body]
9148     //     s1 = phi [s.init, scalar.ph], [s2, scalar.body]
9149     //     s2 = a[i]
9150     //     b[i] = s2 - s1
9151     //     br cond, scalar.body, exit.block
9152     //
9153     //   exit.block:
9154     //     use = lcssa.phi [s1, scalar.body]
9155     //
9156     // In this example, s1 is a recurrence because its value depends on the
9157     // previous iteration. In the first phase of vectorization, we created a
9158     // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts
9159     // for users in the scalar preheader and exit block.
9160     //
9161     //   vector.ph:
9162     //     v_init = vector(..., ..., ..., a[-1])
9163     //     br vector.body
9164     //
9165     //   vector.body
9166     //     i = phi [0, vector.ph], [i+4, vector.body]
9167     //     v1 = phi [v_init, vector.ph], [v2, vector.body]
9168     //     v2 = a[i, i+1, i+2, i+3]
9169     //     b[i] = v2 - v1
9170     //     // Next, third phase will introduce v1' = splice(v1(3), v2(0, 1, 2))
9171     //     b[i, i+1, i+2, i+3] = v2 - v1
9172     //     br cond, vector.body, middle.block
9173     //
9174     //   middle.block:
9175     //     vector.recur.extract.for.phi = v2(2)
9176     //     vector.recur.extract = v2(3)
9177     //     br cond, scalar.ph, exit.block
9178     //
9179     //   scalar.ph:
9180     //     scalar.recur.init = phi [vector.recur.extract, middle.block],
9181     //                             [s.init, otherwise]
9182     //     br scalar.body
9183     //
9184     //   scalar.body:
9185     //     i = phi [0, scalar.ph], [i+1, scalar.body]
9186     //     s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body]
9187     //     s2 = a[i]
9188     //     b[i] = s2 - s1
9189     //     br cond, scalar.body, exit.block
9190     //
9191     //   exit.block:
9192     //     lo = lcssa.phi [s1, scalar.body],
9193     //                    [vector.recur.extract.for.phi, middle.block]
9194     //
9195     // Now update VPIRInstructions modeling LCSSA phis in the exit block.
9196     // Extract the penultimate value of the recurrence and use it as operand for
9197     // the VPIRInstruction modeling the phi.
9198     for (VPIRInstruction *ExitIRI : ExitUsersToFix) {
9199       if (ExitIRI->getOperand(0) != FOR)
9200         continue;
9201       VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
9202           VPInstruction::ExtractFromEnd, {FOR->getBackedgeValue(), TwoVPV}, {},
9203           "vector.recur.extract.for.phi");
9204       ExitIRI->setOperand(0, PenultimateElement);
9205       ExitUsersToFix.remove(ExitIRI);
9206     }
9207   }
9208 }
9209 
9210 VPlanPtr
9211 LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
9212 
9213   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
9214 
9215   // ---------------------------------------------------------------------------
9216   // Build initial VPlan: Scan the body of the loop in a topological order to
9217   // visit each basic block after having visited its predecessor basic blocks.
9218   // ---------------------------------------------------------------------------
9219 
9220   // Create initial VPlan skeleton, having a basic block for the pre-header
9221   // which contains SCEV expansions that need to happen before the CFG is
9222   // modified; a basic block for the vector pre-header, followed by a region for
9223   // the vector loop, followed by the middle basic block. The skeleton vector
9224   // loop region contains a header and latch basic blocks.
9225 
9226   bool RequiresScalarEpilogueCheck =
9227       LoopVectorizationPlanner::getDecisionAndClampRange(
9228           [this](ElementCount VF) {
9229             return !CM.requiresScalarEpilogue(VF.isVector());
9230           },
9231           Range);
9232   VPlanPtr Plan = VPlan::createInitialVPlan(Legal->getWidestInductionType(),
9233                                             PSE, RequiresScalarEpilogueCheck,
9234                                             CM.foldTailByMasking(), OrigLoop);
9235 
9236   // Don't use getDecisionAndClampRange here, because we don't know the UF,
9237   // so it is better to be conservative here rather than to split this up
9238   // into different VPlans.
9239   // TODO: Consider using getDecisionAndClampRange here to split up VPlans.
9240   bool IVUpdateMayOverflow = false;
9241   for (ElementCount VF : Range)
9242     IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF);
9243 
9244   DebugLoc DL = getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
9245   TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow);
9246   // Use NUW for the induction increment if we proved that it won't overflow in
9247   // the vector loop or when not folding the tail. In the latter case, we know
9248   // that the canonical induction increment will not overflow as the vector trip
9249   // count is >= increment and a multiple of the increment.
9250   bool HasNUW = !IVUpdateMayOverflow || Style == TailFoldingStyle::None;
9251   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL);
9252 
9253   VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE,
9254                                 Builder);
9255 
9256   // ---------------------------------------------------------------------------
9257   // Pre-construction: record ingredients whose recipes we'll need to further
9258   // process after constructing the initial VPlan.
9259   // ---------------------------------------------------------------------------
9260 
9261   // For each interleave group which is relevant for this (possibly trimmed)
9262   // Range, add it to the set of groups to be later applied to the VPlan and add
9263   // placeholders for its members' Recipes which we'll be replacing with a
9264   // single VPInterleaveRecipe.
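       //
       // E.g. (illustrative): the pair of loads
       //   x = a[2*i]; y = a[2*i + 1];
       // forms an interleave group with factor 2, whose two member recipes
       // are later replaced by a single VPInterleaveRecipe at the group's
       // insert position.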
9265   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
9266     auto ApplyIG = [IG, this](ElementCount VF) -> bool {
9267       bool Result = (VF.isVector() && // Query is illegal for VF == 1
9268                      CM.getWideningDecision(IG->getInsertPos(), VF) ==
9269                          LoopVectorizationCostModel::CM_Interleave);
9270       // For scalable vectors, the only interleave factor currently supported
9271       // is 2 since we require the (de)interleave2 intrinsics instead of
9272       // shufflevectors.
9273       assert((!Result || !VF.isScalable() || IG->getFactor() == 2) &&
9274              "Unsupported interleave factor for scalable vectors");
9275       return Result;
9276     };
9277     if (!getDecisionAndClampRange(ApplyIG, Range))
9278       continue;
9279     InterleaveGroups.insert(IG);
9280   }
9281 
9282   // ---------------------------------------------------------------------------
9283   // Construct recipes for the instructions in the loop
9284   // ---------------------------------------------------------------------------
9285 
9286   // Scan the body of the loop in a topological order to visit each basic block
9287   // after having visited its predecessor basic blocks.
9288   LoopBlocksDFS DFS(OrigLoop);
9289   DFS.perform(LI);
9290 
9291   VPBasicBlock *HeaderVPBB = Plan->getVectorLoopRegion()->getEntryBasicBlock();
9292   VPBasicBlock *VPBB = HeaderVPBB;
9293   BasicBlock *HeaderBB = OrigLoop->getHeader();
9294   bool NeedsMasks =
9295       CM.foldTailByMasking() ||
9296       any_of(OrigLoop->blocks(), [this, HeaderBB](BasicBlock *BB) {
9297         bool NeedsBlends = BB != HeaderBB && !BB->phis().empty();
9298         return Legal->blockNeedsPredication(BB) || NeedsBlends;
9299       });
9300 
9301   RecipeBuilder.collectScaledReductions(Range);
9302 
9303   auto *MiddleVPBB = Plan->getMiddleBlock();
9304   VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi();
9305   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
9306     // Relevant instructions from basic block BB will be grouped into VPRecipe
9307     // ingredients and fill a new VPBasicBlock.
9308     if (VPBB != HeaderVPBB)
9309       VPBB->setName(BB->getName());
9310     Builder.setInsertPoint(VPBB);
9311 
9312     if (VPBB == HeaderVPBB)
9313       RecipeBuilder.createHeaderMask();
9314     else if (NeedsMasks)
9315       RecipeBuilder.createBlockInMask(BB);
9316 
9317     // Introduce each ingredient into VPlan.
9318     // TODO: Model and preserve debug intrinsics in VPlan.
9319     for (Instruction &I : drop_end(BB->instructionsWithoutDebug(false))) {
9320       Instruction *Instr = &I;
9321       SmallVector<VPValue *, 4> Operands;
9322       auto *Phi = dyn_cast<PHINode>(Instr);
9323       if (Phi && Phi->getParent() == HeaderBB) {
9324         Operands.push_back(Plan->getOrAddLiveIn(
9325             Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
9326       } else {
9327         auto OpRange = RecipeBuilder.mapToVPValues(Instr->operands());
9328         Operands = {OpRange.begin(), OpRange.end()};
9329       }
9330 
9331       // The stores with invariant address inside the loop will be deleted, and
9332       // in the exit block, a uniform store recipe will be created for the final
9333       // invariant store of the reduction.
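           //
           // E.g. (sketch): in
           //   for (i = 0; i < n; i++) { sum += a[i]; *p = sum; }
           // with loop-invariant p, only the final value of the reduction is
           // stored to *p, via a uniform replicate recipe placed in the
           // middle block.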
9334       StoreInst *SI;
9335       if ((SI = dyn_cast<StoreInst>(&I)) &&
9336           Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) {
9337         // Only create recipe for the final invariant store of the reduction.
9338         if (!Legal->isInvariantStoreOfReduction(SI))
9339           continue;
9340         auto *Recipe = new VPReplicateRecipe(
9341             SI, RecipeBuilder.mapToVPValues(Instr->operands()),
9342             true /* IsUniform */);
9343         Recipe->insertBefore(*MiddleVPBB, MBIP);
9344         continue;
9345       }
9346 
9347       VPRecipeBase *Recipe =
9348           RecipeBuilder.tryToCreateWidenRecipe(Instr, Operands, Range, VPBB);
9349       if (!Recipe)
9350         Recipe = RecipeBuilder.handleReplication(Instr, Range);
9351 
9352       RecipeBuilder.setRecipe(Instr, Recipe);
9353       if (isa<VPHeaderPHIRecipe>(Recipe)) {
9354         // VPHeaderPHIRecipes must be kept in the phi section of HeaderVPBB. In
9355         // the following cases, VPHeaderPHIRecipes may be created after non-phi
9356         // recipes and need to be moved to the phi section of HeaderVPBB:
9357         // * tail-folding (non-phi recipes computing the header mask are
9358         // introduced earlier than regular header phi recipes, and should appear
9359         // after them)
9360         // * Optimizing truncates to VPWidenIntOrFpInductionRecipe.
9361 
9362         assert((HeaderVPBB->getFirstNonPhi() == VPBB->end() ||
9363                 CM.foldTailByMasking() || isa<TruncInst>(Instr)) &&
9364                "unexpected recipe needs moving");
9365         Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
9366       } else
9367         VPBB->appendRecipe(Recipe);
9368     }
9369 
9370     VPBlockUtils::insertBlockAfter(Plan->createVPBasicBlock(""), VPBB);
9371     VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
9372   }
9373 
9374   // After here, VPBB should not be used.
9375   VPBB = nullptr;
9376 
9377   assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
9378          !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
9379          "entry block must be set to a VPRegionBlock having a non-empty entry "
9380          "VPBasicBlock");
9381   RecipeBuilder.fixHeaderPhis();
9382 
9383   // Update wide induction increments to use the same step as the corresponding
9384   // wide induction. This enables detecting induction increments directly in
9385   // VPlan and removes redundant splats.
9386   for (const auto &[Phi, ID] : Legal->getInductionVars()) {
9387     auto *IVInc = cast<Instruction>(
9388         Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
9389     if (IVInc->getOperand(0) != Phi || IVInc->getOpcode() != Instruction::Add)
9390       continue;
9391     VPWidenInductionRecipe *WideIV =
9392         cast<VPWidenInductionRecipe>(RecipeBuilder.getRecipe(Phi));
9393     VPRecipeBase *R = RecipeBuilder.getRecipe(IVInc);
9394     R->setOperand(1, WideIV->getStepValue());
9395   }
9396 
9397   if (auto *UncountableExitingBlock =
9398           Legal->getUncountableEarlyExitingBlock()) {
9399     if (!VPlanTransforms::handleUncountableEarlyExit(
9400             *Plan, *PSE.getSE(), OrigLoop, UncountableExitingBlock,
9401             RecipeBuilder)) {
9402       reportVectorizationFailure(
9403           "Some exit values in loop with uncountable exit not supported yet",
9404           "UncountableEarlyExitLoopsUnsupportedExitValue", ORE, OrigLoop);
9405       return nullptr;
9406     }
9407   }
9408   DenseMap<VPValue *, VPValue *> IVEndValues;
9409   addScalarResumePhis(RecipeBuilder, *Plan, IVEndValues);
9410   SetVector<VPIRInstruction *> ExitUsersToFix =
9411       collectUsersInExitBlocks(OrigLoop, RecipeBuilder, *Plan);
9412   addExitUsersForFirstOrderRecurrences(*Plan, ExitUsersToFix);
9413   addUsersInExitBlocks(*Plan, ExitUsersToFix);
9414 
9415   // ---------------------------------------------------------------------------
9416   // Transform initial VPlan: Apply previously taken decisions, in order, to
9417   // bring the VPlan to its final state.
9418   // ---------------------------------------------------------------------------
9419 
9420   // Adjust the recipes for any in-loop reductions.
9421   adjustRecipesForReductions(Plan, RecipeBuilder, Range.Start);
9422 
9423   // Interleave memory: for each Interleave Group we marked earlier as relevant
9424   // for this VPlan, replace the Recipes widening its memory instructions with a
9425   // single VPInterleaveRecipe at its insertion point.
9426   VPlanTransforms::createInterleaveGroups(
9427       *Plan, InterleaveGroups, RecipeBuilder, CM.isScalarEpilogueAllowed());
9428 
9429   for (ElementCount VF : Range)
9430     Plan->addVF(VF);
9431   Plan->setName("Initial VPlan");
9432 
9433   // Replace VPValues for known constant strides guaranteed by predicate scalar
9434   // evolution.
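       //
       // E.g. (illustrative): if versioning added the predicate "stride == 1"
       // for a symbolic stride, uses of that stride inside the vector loop
       // region (or its preheader) are replaced by the constant 1, enabling
       // unit-stride accesses.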
9435   auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) {
9436     auto *R = cast<VPRecipeBase>(&U);
9437     return R->getParent()->getParent() ||
9438            R->getParent() ==
9439                Plan->getVectorLoopRegion()->getSinglePredecessor();
9440   };
9441   for (auto [_, Stride] : Legal->getLAI()->getSymbolicStrides()) {
9442     auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
9443     auto *ScevStride = dyn_cast<SCEVConstant>(PSE.getSCEV(StrideV));
9444     // Only handle constant strides for now.
9445     if (!ScevStride)
9446       continue;
9447 
9448     auto *CI = Plan->getOrAddLiveIn(
9449         ConstantInt::get(Stride->getType(), ScevStride->getAPInt()));
9450     if (VPValue *StrideVPV = Plan->getLiveIn(StrideV))
9451       StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
9452 
9453     // The versioned value may not be used in the loop directly but through a
9454     // sext/zext. Add new live-ins in those cases.
9455     for (Value *U : StrideV->users()) {
9456       if (!isa<SExtInst, ZExtInst>(U))
9457         continue;
9458       VPValue *StrideVPV = Plan->getLiveIn(U);
9459       if (!StrideVPV)
9460         continue;
9461       unsigned BW = U->getType()->getScalarSizeInBits();
9462       APInt C = isa<SExtInst>(U) ? ScevStride->getAPInt().sext(BW)
9463                                  : ScevStride->getAPInt().zext(BW);
9464       VPValue *CI = Plan->getOrAddLiveIn(ConstantInt::get(U->getType(), C));
9465       StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
9466     }
9467   }
9468 
9469   VPlanTransforms::dropPoisonGeneratingRecipes(*Plan, [this](BasicBlock *BB) {
9470     return Legal->blockNeedsPredication(BB);
9471   });
9472 
9473   // Sink users of fixed-order recurrence past the recipe defining the previous
9474   // value and introduce FirstOrderRecurrenceSplice VPInstructions.
9475   if (!VPlanTransforms::adjustFixedOrderRecurrences(*Plan, Builder))
9476     return nullptr;
9477 
9478   if (useActiveLaneMask(Style)) {
9479     // TODO: Move checks to VPlanTransforms::addActiveLaneMask once
9480     // TailFoldingStyle is visible there.
9481     bool ForControlFlow = useActiveLaneMaskForControlFlow(Style);
9482     bool WithoutRuntimeCheck =
9483         Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
9484     VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow,
9485                                        WithoutRuntimeCheck);
9486   }
9487   VPlanTransforms::optimizeInductionExitUsers(*Plan, IVEndValues);
9488 
9489   assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
9490   return Plan;
9491 }
9492 
9493 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
9494   // Outer loop handling: outer loops may require CFG and instruction-level
9495   // transformations before even evaluating whether vectorization is profitable.
9496   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
9497   // the vectorization pipeline.
9498   assert(!OrigLoop->isInnermost());
9499   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
9500 
9501   // Create new empty VPlan
9502   auto Plan = VPlan::createInitialVPlan(Legal->getWidestInductionType(), PSE,
9503                                         true, false, OrigLoop);
9504 
9505   // Build hierarchical CFG
9506   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
9507   HCFGBuilder.buildHierarchicalCFG();
9508 
9509   for (ElementCount VF : Range)
9510     Plan->addVF(VF);
9511 
9512   VPlanTransforms::VPInstructionsToVPRecipes(
9513       Plan,
9514       [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
9515       *PSE.getSE(), *TLI);
9516 
9517   // Tail folding is not supported for outer loops, so the induction increment
9518   // is guaranteed to not wrap.
9519   bool HasNUW = true;
9520   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW,
9521                         DebugLoc());
9522 
9523   // Collect mapping of IR header phis to header phi recipes, to be used in
9524   // addScalarResumePhis.
9525   VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE,
9526                                 Builder);
9527   for (auto &R : Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
9528     if (isa<VPCanonicalIVPHIRecipe>(&R))
9529       continue;
9530     auto *HeaderR = cast<VPHeaderPHIRecipe>(&R);
9531     RecipeBuilder.setRecipe(HeaderR->getUnderlyingInstr(), HeaderR);
9532   }
9533   DenseMap<VPValue *, VPValue *> IVEndValues;
9534   // TODO: IVEndValues are not used yet in the native path, to optimize exit
9535   // values.
9536   addScalarResumePhis(RecipeBuilder, *Plan, IVEndValues);
9537 
9538   assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
9539   return Plan;
9540 }
9541 
9542 // Adjust the recipes for reductions. For in-loop reductions the chain of
9543 // instructions leading from the loop exit instr to the phi needs to be converted
9544 // to reductions, with one operand being vector and the other being the scalar
9545 // reduction chain. For other reductions, a select is introduced between the phi
9546 // and users outside the vector region when folding the tail.
9547 //
9548 // A ComputeReductionResult recipe is added to the middle block, also for
9549 // in-loop reductions which compute their result in-loop, because generating
9550 // the subsequent bc.merge.rdx phi is driven by ComputeReductionResult recipes.
9551 //
9552 // Adjust AnyOf reductions; replace the reduction phi for the selected value
9553 // with a boolean reduction phi node to check if the condition is true in any
9554 // iteration. The final value is selected by the final ComputeReductionResult.
9555 void LoopVectorizationPlanner::adjustRecipesForReductions(
9556     VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) {
9557   using namespace VPlanPatternMatch;
9558   VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion();
9559   VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock();
9560   VPBasicBlock *MiddleVPBB = Plan->getMiddleBlock();
9561   SmallVector<VPRecipeBase *> ToDelete;
9562 
9563   for (VPRecipeBase &R : Header->phis()) {
9564     auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9565     if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered()))
9566       continue;
9567 
9568     const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
9569     RecurKind Kind = RdxDesc.getRecurrenceKind();
9570     assert(
9571         !RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) &&
9572         !RecurrenceDescriptor::isFindLastIVRecurrenceKind(Kind) &&
9573         "AnyOf and FindLast reductions are not allowed for in-loop reductions");
9574 
9575     // Collect the chain of "link" recipes for the reduction starting at PhiR.
9576     SetVector<VPSingleDefRecipe *> Worklist;
9577     Worklist.insert(PhiR);
9578     for (unsigned I = 0; I != Worklist.size(); ++I) {
9579       VPSingleDefRecipe *Cur = Worklist[I];
9580       for (VPUser *U : Cur->users()) {
9581         auto *UserRecipe = cast<VPSingleDefRecipe>(U);
9582         if (!UserRecipe->getParent()->getEnclosingLoopRegion()) {
9583           assert((UserRecipe->getParent() == MiddleVPBB ||
9584                   UserRecipe->getParent() == Plan->getScalarPreheader()) &&
9585                  "U must be either in the loop region, the middle block or the "
9586                  "scalar preheader.");
9587           continue;
9588         }
9589         Worklist.insert(UserRecipe);
9590       }
9591     }
9592 
9593     // Visit operation "Links" along the reduction chain top-down starting from
9594     // the phi until LoopExitValue. We keep track of the previous item
9595     // (PreviousLink) to tell which of the two operands of a Link will remain
9596     // scalar and which will be reduced. For minmax by select(cmp), Link will be
9597     // the select instructions. Blend recipes of in-loop reduction phis will
9598     // get folded to their non-phi operand, as the reduction recipe handles the
9599     // condition directly.
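         //
         // E.g. (sketch): for an in-loop integer add reduction
         //   %phi = phi i32 [ 0, %ph ], [ %sum, %latch ]
         //   %sum = add i32 %phi, %x
         // the widened add link is replaced below by a VPReductionRecipe that
         // reduces the vector operand %x into the scalar chain starting at
         // %phi.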
9600     VPSingleDefRecipe *PreviousLink = PhiR; // Aka Worklist[0].
9601     for (VPSingleDefRecipe *CurrentLink : Worklist.getArrayRef().drop_front()) {
9602       Instruction *CurrentLinkI = CurrentLink->getUnderlyingInstr();
9603 
9604       // Index of the first operand which holds a non-mask vector operand.
9605       unsigned IndexOfFirstOperand;
9606       // Recognize a call to the llvm.fmuladd intrinsic.
9607       bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
9608       VPValue *VecOp;
9609       VPBasicBlock *LinkVPBB = CurrentLink->getParent();
9610       if (IsFMulAdd) {
9611         assert(
9612             RecurrenceDescriptor::isFMulAddIntrinsic(CurrentLinkI) &&
9613             "Expected instruction to be a call to the llvm.fmuladd intrinsic");
9614         assert(((MinVF.isScalar() && isa<VPReplicateRecipe>(CurrentLink)) ||
9615                 isa<VPWidenIntrinsicRecipe>(CurrentLink)) &&
9616                CurrentLink->getOperand(2) == PreviousLink &&
9617                "expected a call where the previous link is the added operand");
9618 
9619         // If the instruction is a call to the llvm.fmuladd intrinsic then we
9620         // need to create an fmul recipe (multiplying the first two operands of
9621         // the fmuladd together) to use as the vector operand for the fadd
9622         // reduction.
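        // For illustration (assumed operand order, matching the asserts above):
        // a link of the form
        //   call @llvm.fmuladd(%a, %b, %prev)
        // becomes
        //   %mul = fmul %a, %b
        //   reduce.fadd(%prev, %mul)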
9623         VPInstruction *FMulRecipe = new VPInstruction(
9624             Instruction::FMul,
9625             {CurrentLink->getOperand(0), CurrentLink->getOperand(1)},
9626             CurrentLinkI->getFastMathFlags());
9627         LinkVPBB->insert(FMulRecipe, CurrentLink->getIterator());
9628         VecOp = FMulRecipe;
9629       } else {
9630         auto *Blend = dyn_cast<VPBlendRecipe>(CurrentLink);
9631         if (PhiR->isInLoop() && Blend) {
9632           assert(Blend->getNumIncomingValues() == 2 &&
9633                  "Blend must have 2 incoming values");
9634           if (Blend->getIncomingValue(0) == PhiR)
9635             Blend->replaceAllUsesWith(Blend->getIncomingValue(1));
9636           else {
9637             assert(Blend->getIncomingValue(1) == PhiR &&
9638                    "PhiR must be an operand of the blend");
9639             Blend->replaceAllUsesWith(Blend->getIncomingValue(0));
9640           }
9641           continue;
9642         }
9643 
9644         if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9645           if (isa<VPWidenRecipe>(CurrentLink)) {
9646             assert(isa<CmpInst>(CurrentLinkI) &&
9647                    "need to have the compare of the select");
9648             continue;
9649           }
9650           assert(isa<VPWidenSelectRecipe>(CurrentLink) &&
9651                  "must be a select recipe");
9652           IndexOfFirstOperand = 1;
9653         } else {
9654           assert((MinVF.isScalar() || isa<VPWidenRecipe>(CurrentLink)) &&
9655                  "Expected to replace a VPWidenSC");
9656           IndexOfFirstOperand = 0;
9657         }
9658         // Note that for non-commutable operands (cmp-selects), the semantics of
9659         // the cmp-select are captured in the recurrence kind.
9660         unsigned VecOpId =
9661             CurrentLink->getOperand(IndexOfFirstOperand) == PreviousLink
9662                 ? IndexOfFirstOperand + 1
9663                 : IndexOfFirstOperand;
9664         VecOp = CurrentLink->getOperand(VecOpId);
9665         assert(VecOp != PreviousLink &&
9666                CurrentLink->getOperand(CurrentLink->getNumOperands() - 1 -
9667                                        (VecOpId - IndexOfFirstOperand)) ==
9668                    PreviousLink &&
9669                "PreviousLink must be the operand other than VecOp");
9670       }
9671 
9672       BasicBlock *BB = CurrentLinkI->getParent();
9673       VPValue *CondOp = nullptr;
9674       if (CM.blockNeedsPredicationForAnyReason(BB))
9675         CondOp = RecipeBuilder.getBlockInMask(BB);
9676 
9677       auto *RedRecipe = new VPReductionRecipe(
9678           RdxDesc, CurrentLinkI, PreviousLink, VecOp, CondOp,
9679           CM.useOrderedReductions(RdxDesc), CurrentLinkI->getDebugLoc());
9680       // Append the recipe to the end of the VPBasicBlock because we need to
9681       // ensure that it comes after all of its inputs, including CondOp.
9682       // Delete CurrentLink as it will be invalid if its operand is replaced
9683       // with a reduction defined at the bottom of the block in the next link.
9684       LinkVPBB->appendRecipe(RedRecipe);
9685       CurrentLink->replaceAllUsesWith(RedRecipe);
9686       ToDelete.push_back(CurrentLink);
9687       PreviousLink = RedRecipe;
9688     }
9689   }
9690   VPBasicBlock *LatchVPBB = VectorLoopRegion->getExitingBasicBlock();
9691   Builder.setInsertPoint(&*LatchVPBB->begin());
9692   VPBasicBlock::iterator IP = MiddleVPBB->getFirstNonPhi();
9693   for (VPRecipeBase &R :
9694        Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
9695     VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9696     if (!PhiR)
9697       continue;
9698 
9699     const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
9700     // If tail is folded by masking, introduce selects between the phi
9701     // and the users outside the vector region of each reduction, at the
9702     // beginning of the dedicated latch block.
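    // Illustrative shape of the select (assumed names): with tail folding, the
    // exiting value %rdx.next is wrapped as
    //   select %header.mask, %rdx.next, %rdx.phi
    // so that masked-off lanes keep the value of the reduction phi.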
9703     auto *OrigExitingVPV = PhiR->getBackedgeValue();
9704     auto *NewExitingVPV = PhiR->getBackedgeValue();
9705     if (!PhiR->isInLoop() && CM.foldTailByMasking()) {
9706       VPValue *Cond = RecipeBuilder.getBlockInMask(OrigLoop->getHeader());
9707       assert(OrigExitingVPV->getDefiningRecipe()->getParent() != LatchVPBB &&
9708              "reduction recipe must be defined before latch");
9709       Type *PhiTy = PhiR->getOperand(0)->getLiveInIRValue()->getType();
9710       std::optional<FastMathFlags> FMFs =
9711           PhiTy->isFloatingPointTy()
9712               ? std::make_optional(RdxDesc.getFastMathFlags())
9713               : std::nullopt;
9714       NewExitingVPV =
9715           Builder.createSelect(Cond, OrigExitingVPV, PhiR, {}, "", FMFs);
9716       OrigExitingVPV->replaceUsesWithIf(NewExitingVPV, [](VPUser &U, unsigned) {
9717         return isa<VPInstruction>(&U) &&
9718                cast<VPInstruction>(&U)->getOpcode() ==
9719                    VPInstruction::ComputeReductionResult;
9720       });
9721       if (CM.usePredicatedReductionSelect(
9722               PhiR->getRecurrenceDescriptor().getOpcode(), PhiTy))
9723         PhiR->setOperand(1, NewExitingVPV);
9724     }
9725 
9726     // If the vector reduction can be performed in a smaller type, we truncate
9727     // then extend the loop exit value to enable InstCombine to evaluate the
9728     // entire expression in the smaller type.
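    // For example (illustrative types): an i32 add reduction whose values are
    // known to fit in i8 gets
    //   %t = trunc <VF x i32> %rdx.next to <VF x i8>
    //   %e = zext <VF x i8> %t to <VF x i32>
    // so InstCombine can later narrow the whole reduction chain to i8.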
9729     Type *PhiTy = PhiR->getStartValue()->getLiveInIRValue()->getType();
9730     if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType() &&
9731         !RecurrenceDescriptor::isAnyOfRecurrenceKind(
9732             RdxDesc.getRecurrenceKind())) {
9733       assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
9734       Type *RdxTy = RdxDesc.getRecurrenceType();
9735       auto *Trunc =
9736           new VPWidenCastRecipe(Instruction::Trunc, NewExitingVPV, RdxTy);
9737       auto *Extnd =
9738           RdxDesc.isSigned()
9739               ? new VPWidenCastRecipe(Instruction::SExt, Trunc, PhiTy)
9740               : new VPWidenCastRecipe(Instruction::ZExt, Trunc, PhiTy);
9741 
9742       Trunc->insertAfter(NewExitingVPV->getDefiningRecipe());
9743       Extnd->insertAfter(Trunc);
9744       if (PhiR->getOperand(1) == NewExitingVPV)
9745         PhiR->setOperand(1, Extnd->getVPSingleValue());
9746       NewExitingVPV = Extnd;
9747     }
9748 
9749     // We want code in the middle block to appear to execute at the location of
9750     // the scalar loop's latch terminator because: (a) it is all compiler
9751     // generated, (b) these instructions are always executed after evaluating
9752     // the latch conditional branch, and (c) other passes may add new
9753     // predecessors which terminate on this line. This is the easiest way to
9754     // ensure we don't accidentally cause an extra step back into the loop while
9755     // debugging.
9756     DebugLoc ExitDL = OrigLoop->getLoopLatch()->getTerminator()->getDebugLoc();
9757 
9758     // TODO: At the moment ComputeReductionResult also drives creation of the
9759     // bc.merge.rdx phi nodes, hence it needs to be created unconditionally here
9760     // even for in-loop reductions, until the reduction resume value handling is
9761     // also modeled in VPlan.
9762     auto *FinalReductionResult = new VPInstruction(
9763         VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL);
9764     // Update all users outside the vector region.
9765     OrigExitingVPV->replaceUsesWithIf(
9766         FinalReductionResult, [](VPUser &User, unsigned) {
9767           auto *Parent = cast<VPRecipeBase>(&User)->getParent();
9768           return Parent && !Parent->getParent();
9769         });
9770     FinalReductionResult->insertBefore(*MiddleVPBB, IP);
9771 
9772     // Adjust AnyOf reductions; replace the reduction phi for the selected value
9773     // with a boolean reduction phi node to check if the condition is true in
9774     // any iteration. The final value is selected by the final
9775     // ComputeReductionResult.
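    // Illustrative rewrite (assumed shapes): for
    //   %sel = select %cmp, i32 3, i32 %rdx.phi
    // the select is replaced by
    //   %any = or %rdx.phi, %cmp
    // with %rdx.phi rewritten to a boolean phi starting at 'false'; the final
    // scalar value is then produced by ComputeReductionResult.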
9776     if (RecurrenceDescriptor::isAnyOfRecurrenceKind(
9777             RdxDesc.getRecurrenceKind())) {
9778       auto *Select = cast<VPRecipeBase>(*find_if(PhiR->users(), [](VPUser *U) {
9779         return isa<VPWidenSelectRecipe>(U) ||
9780                (isa<VPReplicateRecipe>(U) &&
9781                 cast<VPReplicateRecipe>(U)->getUnderlyingInstr()->getOpcode() ==
9782                     Instruction::Select);
9783       }));
9784       VPValue *Cmp = Select->getOperand(0);
9785       // If the compare is checking the reduction PHI node, adjust it to check
9786       // the start value.
9787       if (VPRecipeBase *CmpR = Cmp->getDefiningRecipe()) {
9788         for (unsigned I = 0; I != CmpR->getNumOperands(); ++I)
9789           if (CmpR->getOperand(I) == PhiR)
9790             CmpR->setOperand(I, PhiR->getStartValue());
9791       }
9792       VPBuilder::InsertPointGuard Guard(Builder);
9793       Builder.setInsertPoint(Select);
9794 
9795       // If the true value of the select is the reduction phi, the new value is
9796       // selected if the negated condition is true in any iteration.
9797       if (Select->getOperand(1) == PhiR)
9798         Cmp = Builder.createNot(Cmp);
9799       VPValue *Or = Builder.createOr(PhiR, Cmp);
9800       Select->getVPSingleValue()->replaceAllUsesWith(Or);
9801       // Delete Select now that it has invalid types.
9802       ToDelete.push_back(Select);
9803 
9804       // Convert the reduction phi to operate on bools.
9805       PhiR->setOperand(0, Plan->getOrAddLiveIn(ConstantInt::getFalse(
9806                               OrigLoop->getHeader()->getContext())));
9807       continue;
9808     }
9809 
9810     if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(
9811             RdxDesc.getRecurrenceKind())) {
9812       // Adjust the start value for FindLastIV recurrences to use the sentinel
9813       // value after generating the ResumePhi recipe, which uses the original
9814       // start value.
9815       PhiR->setOperand(0, Plan->getOrAddLiveIn(RdxDesc.getSentinelValue()));
9816     }
9817   }
9818 
9819   VPlanTransforms::clearReductionWrapFlags(*Plan);
9820   for (VPRecipeBase *R : ToDelete)
9821     R->eraseFromParent();
9822 }
9823 
9824 void VPDerivedIVRecipe::execute(VPTransformState &State) {
9825   assert(!State.Lane && "VPDerivedIVRecipe being replicated.");
9826 
9827   // Fast-math-flags propagate from the original induction instruction.
9828   IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
9829   if (FPBinOp)
9830     State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags());
9831 
9832   Value *Step = State.get(getStepValue(), VPLane(0));
9833   Value *Index = State.get(getOperand(1), VPLane(0));
9834   Value *DerivedIV = emitTransformedIndex(
9835       State.Builder, Index, getStartValue()->getLiveInIRValue(), Step, Kind,
9836       cast_if_present<BinaryOperator>(FPBinOp));
9837   DerivedIV->setName(Name);
9838   // If index is the vector trip count, the concrete value will only be set in
9839   // prepareToExecute, leading to missed simplifications, e.g. if it is 0.
9840   // TODO: Remove the special case for the vector trip count once it is computed
9841   // in VPlan and can be used during VPlan simplification.
9842   assert((DerivedIV != Index ||
9843           getOperand(1) == &getParent()->getPlan()->getVectorTripCount()) &&
9844          "IV didn't need transforming?");
9845   State.set(this, DerivedIV, VPLane(0));
9846 }
9847 
9848 void VPReplicateRecipe::execute(VPTransformState &State) {
9849   Instruction *UI = getUnderlyingInstr();
9850   if (State.Lane) { // Generate a single instance.
9851     assert((State.VF.isScalar() || !isUniform()) &&
9852            "uniform recipe shouldn't be predicated");
9853     assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9854     State.ILV->scalarizeInstruction(UI, this, *State.Lane, State);
9855     // Insert scalar instance packing it into a vector.
9856     if (State.VF.isVector() && shouldPack()) {
9857       // If we're constructing lane 0, initialize to start from poison.
9858       if (State.Lane->isFirstLane()) {
9859         assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9860         Value *Poison = PoisonValue::get(
9861             VectorType::get(UI->getType(), State.VF));
9862         State.set(this, Poison);
9863       }
9864       State.packScalarIntoVectorValue(this, *State.Lane);
9865     }
9866     return;
9867   }
9868 
9869   if (IsUniform) {
9870     // Uniform within VL means we need to generate lane 0.
9871     State.ILV->scalarizeInstruction(UI, this, VPLane(0), State);
9872     return;
9873   }
9874 
9875   // A store of a loop varying value to a uniform address only needs the last
9876   // copy of the store.
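  // E.g. (illustrative) for 'a[0] = b[i]' only the last lane's store is
  // generated, matching the scalar loop where the last iteration's value wins.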
9877   if (isa<StoreInst>(UI) &&
9878       vputils::isUniformAfterVectorization(getOperand(1))) {
9879     auto Lane = VPLane::getLastLaneForVF(State.VF);
9880     State.ILV->scalarizeInstruction(UI, this, VPLane(Lane), State);
9881     return;
9882   }
9883 
9884   // Generate scalar instances for all VF lanes.
9885   assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9886   const unsigned EndLane = State.VF.getKnownMinValue();
9887   for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9888     State.ILV->scalarizeInstruction(UI, this, VPLane(Lane), State);
9889 }
9890 
9891 // Determine how to lower the scalar epilogue, which depends on 1) optimizing
9892 // for minimum code size, 2) predicate compiler options, 3) loop hints forcing
9893 // predication, and 4) a TTI hook that analyses whether the loop is suitable
9894 // for predication.
9895 static ScalarEpilogueLowering getScalarEpilogueLowering(
9896     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
9897     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
9898     LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI) {
9899   // 1) OptSize takes precedence over all other options, i.e. if this is set,
9900   // don't look at hints or options, and don't request a scalar epilogue.
9901   // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
9902   // LoopAccessInfo (due to code dependency and not being able to reliably get
9903   // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
9904   // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
9905   // versioning when the vectorization is forced, unlike hasOptSize. So revert
9906   // back to the old way and vectorize with versioning when forced. See D81345.)
9907   if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
9908                                                       PGSOQueryType::IRPass) &&
9909                           Hints.getForce() != LoopVectorizeHints::FK_Enabled))
9910     return CM_ScalarEpilogueNotAllowedOptSize;
9911 
9912   // 2) If set, obey the directives
9913   if (PreferPredicateOverEpilogue.getNumOccurrences()) {
9914     switch (PreferPredicateOverEpilogue) {
9915     case PreferPredicateTy::ScalarEpilogue:
9916       return CM_ScalarEpilogueAllowed;
9917     case PreferPredicateTy::PredicateElseScalarEpilogue:
9918       return CM_ScalarEpilogueNotNeededUsePredicate;
9919     case PreferPredicateTy::PredicateOrDontVectorize:
9920       return CM_ScalarEpilogueNotAllowedUsePredicate;
9921     };
9922   }
9923 
9924   // 3) If set, obey the hints
9925   switch (Hints.getPredicate()) {
9926   case LoopVectorizeHints::FK_Enabled:
9927     return CM_ScalarEpilogueNotNeededUsePredicate;
9928   case LoopVectorizeHints::FK_Disabled:
9929     return CM_ScalarEpilogueAllowed;
9930   };
9931 
9932   // 4) if the TTI hook indicates this is profitable, request predication.
9933   TailFoldingInfo TFI(TLI, &LVL, IAI);
9934   if (TTI->preferPredicateOverEpilogue(&TFI))
9935     return CM_ScalarEpilogueNotNeededUsePredicate;
9936 
9937   return CM_ScalarEpilogueAllowed;
9938 }
9939 
9940 // Process the loop in the VPlan-native vectorization path. This path builds
9941 // VPlan upfront in the vectorization pipeline, which allows applying
9942 // VPlan-to-VPlan transformations from the very beginning without modifying the
9943 // input LLVM IR.
9944 static bool processLoopInVPlanNativePath(
9945     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
9946     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
9947     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
9948     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
9949     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
9950     LoopVectorizationRequirements &Requirements) {
9951 
9952   if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
9953     LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
9954     return false;
9955   }
9956   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
9957   Function *F = L->getHeader()->getParent();
9958   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
9959 
9960   ScalarEpilogueLowering SEL =
9961       getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, *LVL, &IAI);
9962 
9963   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
9964                                 &Hints, IAI);
9965   // Use the planner for outer loop vectorization.
9966   // TODO: CM is not used at this point inside the planner. Turn CM into an
9967   // optional argument if we don't need it in the future.
9968   LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, IAI, PSE, Hints,
9969                                ORE);
9970 
9971   // Get user vectorization factor.
9972   ElementCount UserVF = Hints.getWidth();
9973 
9974   CM.collectElementTypesForWidening();
9975 
9976   // Plan how to best vectorize, return the best VF and its cost.
9977   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
9978 
9979   // If we are stress testing VPlan builds, do not attempt to generate vector
9980   // code. Masked vector code generation support will follow soon.
9981   // Also, do not attempt to vectorize if no vector code will be produced.
9982   if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF)
9983     return false;
9984 
9985   VPlan &BestPlan = LVP.getPlanFor(VF.Width);
9986 
9987   {
9988     bool AddBranchWeights =
9989         hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
9990     GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(),
9991                              AddBranchWeights, CM.CostKind);
9992     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
9993                            VF.Width, 1, LVL, &CM, BFI, PSI, Checks, BestPlan);
9994     LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
9995                       << L->getHeader()->getParent()->getName() << "\"\n");
9996     LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false);
9997   }
9998 
9999   reportVectorization(ORE, L, VF, 1);
10000 
10001   // Mark the loop as already vectorized to avoid vectorizing again.
10002   Hints.setAlreadyVectorized();
10003   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10004   return true;
10005 }
10006 
10007 // Emit a remark if there are stores to floats that required a floating point
10008 // extension. If the vectorized loop was generated with double precision there
10009 // will be a performance penalty from the conversion overhead and the change in
10010 // the vector width.
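// A pattern that would trigger the remark (illustrative IR, not from any
// particular test):
//   %e = fpext float %x to double
//   %m = fmul double %e, %c
//   %t = fptrunc double %m to float
//   store float %t, ptr %p
// Walking up the operands of the float store reaches the fpext.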
10011 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
10012   SmallVector<Instruction *, 4> Worklist;
10013   for (BasicBlock *BB : L->getBlocks()) {
10014     for (Instruction &Inst : *BB) {
10015       if (auto *S = dyn_cast<StoreInst>(&Inst)) {
10016         if (S->getValueOperand()->getType()->isFloatTy())
10017           Worklist.push_back(S);
10018       }
10019     }
10020   }
10021 
10022   // Traverse the floating point stores upwards, searching for floating point
10023   // conversions.
10024   SmallPtrSet<const Instruction *, 4> Visited;
10025   SmallPtrSet<const Instruction *, 4> EmittedRemark;
10026   while (!Worklist.empty()) {
10027     auto *I = Worklist.pop_back_val();
10028     if (!L->contains(I))
10029       continue;
10030     if (!Visited.insert(I).second)
10031       continue;
10032 
10033     // Emit a remark if the floating point store required a floating
10034     // point conversion.
10035     // TODO: More work could be done to identify the root cause such as a
10036     // constant or a function return type and point the user to it.
10037     if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
10038       ORE->emit([&]() {
10039         return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
10040                                           I->getDebugLoc(), L->getHeader())
10041                << "floating point conversion changes vector width. "
10042                << "Mixed floating point precision requires an up/down "
10043                << "cast that will negatively impact performance.";
10044       });
10045 
10046     for (Use &Op : I->operands())
10047       if (auto *OpI = dyn_cast<Instruction>(Op))
10048         Worklist.push_back(OpI);
10049   }
10050 }
10051 
10052 static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
10053                                        VectorizationFactor &VF, Loop *L,
10054                                        const TargetTransformInfo &TTI,
10055                                        PredicatedScalarEvolution &PSE,
10056                                        ScalarEpilogueLowering SEL) {
10057   InstructionCost CheckCost = Checks.getCost();
10058   if (!CheckCost.isValid())
10059     return false;
10060 
10061   // When interleaving only, the scalar and vector costs will be equal, which
10062   // in turn would lead to a divide by 0. Fall back to the hard threshold.
10063   if (VF.Width.isScalar()) {
10064     if (CheckCost > VectorizeMemoryCheckThreshold) {
10065       LLVM_DEBUG(
10066           dbgs()
10067           << "LV: Interleaving only is not profitable due to runtime checks\n");
10068       return false;
10069     }
10070     return true;
10071   }
10072 
10073   // The scalar cost should only be 0 when vectorizing with a user-specified
  // VF/IC. In those cases, runtime checks should always be generated.
10074   uint64_t ScalarC = *VF.ScalarCost.getValue();
10075   if (ScalarC == 0)
10076     return true;
10077 
10078   // First, compute the minimum iteration count required so that the vector
10079   // loop outperforms the scalar loop.
10080   //  The total cost of the scalar loop is
10081   //   ScalarC * TC
10082   //  where
10083   //  * TC is the actual trip count of the loop.
10084   //  * ScalarC is the cost of a single scalar iteration.
10085   //
10086   //  The total cost of the vector loop is
10087   //    RtC + VecC * (TC / VF) + EpiC
10088   //  where
10089   //  * RtC is the cost of the generated runtime checks
10090   //  * VecC is the cost of a single vector iteration.
10091   //  * TC is the actual trip count of the loop
10092   //  * VF is the vectorization factor
10093   //  * EpiC is the cost of the generated epilogue, including the cost
10094   //    of the remaining scalar operations.
10095   //
10096   // Vectorization is profitable once the total vector cost is less than the
10097   // total scalar cost:
10098   //   RtC + VecC * (TC / VF) + EpiC <  ScalarC * TC
10099   //
10100   // Now we can compute the minimum required trip count TC as
10101   //   VF * (RtC + EpiC) / (ScalarC * VF - VecC) < TC
10102   //
10103   // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
10104   // the division below rounds up, hence we get an upper estimate of the
10105   // minimum TC.
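  // Hypothetical illustration (numbers invented): with ScalarC = 4, VecC = 6,
  // RtC = 30 and an estimated VF of 4, the bound is
  //   4 * 30 / (4 * 4 - 6) = 12,
  // i.e. roughly 12 iterations are needed before the runtime checks pay off.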
10106   unsigned IntVF = getEstimatedRuntimeVF(L, TTI, VF.Width);
10107   uint64_t RtC = *CheckCost.getValue();
10108   uint64_t Div = ScalarC * IntVF - *VF.Cost.getValue();
10109   uint64_t MinTC1 = Div == 0 ? 0 : divideCeil(RtC * IntVF, Div);
10110 
10111   // Second, compute a minimum iteration count so that the cost of the
10112   // runtime checks is only a fraction of the total scalar loop cost. This
10113   // adds a loop-dependent bound on the overhead incurred if the runtime
10114   // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC
10115   // * TC. To bound the runtime check to be a fraction 1/X of the scalar
10116   // cost, compute
10117   //   RtC < ScalarC * TC * (1 / X)  ==>  RtC * X / ScalarC < TC
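  // Here X is hard-coded to 10, i.e. at the resulting minimum trip count a
  // failing runtime check costs at most about a tenth of the scalar loop.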
10118   uint64_t MinTC2 = divideCeil(RtC * 10, ScalarC);
10119 
10120   // Now pick the larger minimum. If it is not a multiple of VF and a scalar
10121   // epilogue is allowed, choose the next closest multiple of VF. This should
10122   // partly compensate for ignoring the epilogue cost.
10123   uint64_t MinTC = std::max(MinTC1, MinTC2);
10124   if (SEL == CM_ScalarEpilogueAllowed)
10125     MinTC = alignTo(MinTC, IntVF);
10126   VF.MinProfitableTripCount = ElementCount::getFixed(MinTC);
10127 
10128   LLVM_DEBUG(
10129       dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
10130              << VF.MinProfitableTripCount << "\n");
10131 
10132   // Skip vectorization if the expected trip count is less than the minimum
10133   // required trip count.
10134   if (auto ExpectedTC = getSmallBestKnownTC(PSE, L)) {
10135     if (ElementCount::isKnownLT(ElementCount::getFixed(*ExpectedTC),
10136                                 VF.MinProfitableTripCount)) {
10137       LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
10138                            "trip count < minimum profitable trip count ("
10139                         << *ExpectedTC << " < " << VF.MinProfitableTripCount
10140                         << ")\n");
10141 
10142       return false;
10143     }
10144   }
10145   return true;
10146 }
10147 
10148 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
10149     : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
10150                                !EnableLoopInterleaving),
10151       VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
10152                               !EnableLoopVectorization) {}
10153 
10154 /// Prepare \p MainPlan for vectorizing the main vector loop during epilogue
10155 /// vectorization. Remove ResumePhis from \p MainPlan for inductions that
10156 /// don't have a corresponding wide induction in \p EpiPlan.
10157 static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) {
10158   // Collect PHI nodes of widened phis in the VPlan for the epilogue. Those
10159   // will need their resume-values computed in the main vector loop. Others
10160   // can be removed from the main VPlan.
10161   SmallPtrSet<PHINode *, 2> EpiWidenedPhis;
10162   for (VPRecipeBase &R :
10163        EpiPlan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
10164     if (isa<VPCanonicalIVPHIRecipe>(&R))
10165       continue;
10166     EpiWidenedPhis.insert(
10167         cast<PHINode>(R.getVPSingleValue()->getUnderlyingValue()));
10168   }
10169   for (VPRecipeBase &R : make_early_inc_range(
10170            *cast<VPIRBasicBlock>(MainPlan.getScalarHeader()))) {
10171     auto *VPIRInst = cast<VPIRInstruction>(&R);
10172     auto *IRI = dyn_cast<PHINode>(&VPIRInst->getInstruction());
10173     if (!IRI)
10174       break;
10175     if (EpiWidenedPhis.contains(IRI))
10176       continue;
10177     // There is no corresponding wide induction in the epilogue plan that would
10178     // need a resume value. Remove the VPIRInst wrapping the scalar header phi
10179     // together with the corresponding ResumePhi. The resume values for the
10180     // scalar loop will be created during execution of EpiPlan.
10181     VPRecipeBase *ResumePhi = VPIRInst->getOperand(0)->getDefiningRecipe();
10182     VPIRInst->eraseFromParent();
10183     ResumePhi->eraseFromParent();
10184   }
10185   VPlanTransforms::removeDeadRecipes(MainPlan);
10186 
10187   using namespace VPlanPatternMatch;
10188   VPBasicBlock *MainScalarPH = MainPlan.getScalarPreheader();
10189   VPValue *VectorTC = &MainPlan.getVectorTripCount();
10190   // If there is a suitable resume value for the canonical induction in the
10191   // scalar (which will become vector) epilogue loop, we are done. Otherwise
10192   // create it below.
10193   if (any_of(*MainScalarPH, [VectorTC](VPRecipeBase &R) {
10194         return match(&R, m_VPInstruction<VPInstruction::ResumePhi>(
10195                              m_Specific(VectorTC), m_SpecificInt(0)));
10196       }))
10197     return;
10198   VPBuilder ScalarPHBuilder(MainScalarPH, MainScalarPH->begin());
10199   ScalarPHBuilder.createNaryOp(
10200       VPInstruction::ResumePhi,
10201       {VectorTC, MainPlan.getCanonicalIV()->getStartValue()}, {},
10202       "vec.epilog.resume.val");
10203 }
10204 
10205 /// Prepare \p Plan for vectorizing the epilogue loop. That is, re-use expanded
10206 /// SCEVs from \p ExpandedSCEVs and set resume values for header recipes.
10207 static void
10208 preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L,
10209                                  const SCEV2ValueTy &ExpandedSCEVs,
10210                                  const EpilogueLoopVectorizationInfo &EPI) {
10211   VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
10212   VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
10213   Header->setName("vec.epilog.vector.body");
10214 
10215   // Re-use the trip count and steps expanded for the main loop, as
10216   // skeleton creation needs them as values that dominate both the scalar
10217   // and vector epilogue loops.
10218   // TODO: This is a workaround needed for epilogue vectorization and it
10219   // should be removed once induction resume value creation is done
10220   // directly in VPlan.
10221   for (auto &R : make_early_inc_range(*Plan.getEntry())) {
10222     auto *ExpandR = dyn_cast<VPExpandSCEVRecipe>(&R);
10223     if (!ExpandR)
10224       continue;
10225     auto *ExpandedVal =
10226         Plan.getOrAddLiveIn(ExpandedSCEVs.find(ExpandR->getSCEV())->second);
10227     ExpandR->replaceAllUsesWith(ExpandedVal);
10228     if (Plan.getTripCount() == ExpandR)
10229       Plan.resetTripCount(ExpandedVal);
10230     ExpandR->eraseFromParent();
10231   }
10232 
10233   // Ensure that the start values for all header phi recipes are updated before
10234   // vectorizing the epilogue loop.
10235   for (VPRecipeBase &R : Header->phis()) {
10236     if (auto *IV = dyn_cast<VPCanonicalIVPHIRecipe>(&R)) {
10237       // When vectorizing the epilogue loop, the canonical induction start
10238       // value needs to be changed from zero to the value after the main
10239       // vector loop. Find the resume value created during execution of the main
10240       // VPlan.
10241       // FIXME: Improve modeling for canonical IV start values in the epilogue
10242       // loop.
10243       BasicBlock *MainMiddle = find_singleton<BasicBlock>(
10244           predecessors(L->getLoopPreheader()),
10245           [&EPI](BasicBlock *BB, bool) -> BasicBlock * {
10246             if (BB != EPI.MainLoopIterationCountCheck &&
10247                 BB != EPI.EpilogueIterationCountCheck &&
10248                 BB != EPI.SCEVSafetyCheck && BB != EPI.MemSafetyCheck)
10249               return BB;
10250             return nullptr;
10251           });
10252       using namespace llvm::PatternMatch;
10253       Type *IdxTy = IV->getScalarType();
10254       PHINode *EPResumeVal = find_singleton<PHINode>(
10255           L->getLoopPreheader()->phis(),
10256           [&EPI, IdxTy, MainMiddle](PHINode &P, bool) -> PHINode * {
10257             if (P.getType() == IdxTy &&
10258                 P.getIncomingValueForBlock(MainMiddle) == EPI.VectorTripCount &&
10259                 match(
10260                     P.getIncomingValueForBlock(EPI.MainLoopIterationCountCheck),
10261                     m_SpecificInt(0)))
10262               return &P;
10263             return nullptr;
10264           });
10265       assert(EPResumeVal && "must have a resume value for the canonical IV");
10266       VPValue *VPV = Plan.getOrAddLiveIn(EPResumeVal);
10267       assert(all_of(IV->users(),
10268                     [](const VPUser *U) {
10269                       return isa<VPScalarIVStepsRecipe>(U) ||
10270                              isa<VPScalarCastRecipe>(U) ||
10271                              isa<VPDerivedIVRecipe>(U) ||
10272                              cast<VPInstruction>(U)->getOpcode() ==
10273                                  Instruction::Add;
10274                     }) &&
10275              "the canonical IV should only be used by its increment or "
10276              "ScalarIVSteps when resetting the start value");
10277       IV->setOperand(0, VPV);
10278       continue;
10279     }
10280 
10281     Value *ResumeV = nullptr;
10282     // TODO: Move setting of resume values to prepareToExecute.
10283     if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
10284       ResumeV = cast<PHINode>(ReductionPhi->getUnderlyingInstr())
10285                     ->getIncomingValueForBlock(L->getLoopPreheader());
10286       const RecurrenceDescriptor &RdxDesc =
10287           ReductionPhi->getRecurrenceDescriptor();
10288       RecurKind RK = RdxDesc.getRecurrenceKind();
10289       if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
10290         // VPReductionPHIRecipes for AnyOf reductions expect a boolean as
10291         // start value; compare the final value from the main vector loop
10292         // to the start value.
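        // In other words, the epilogue's boolean AnyOf phi starts out true only
        // if the main vector loop already produced a value different from the
        // original start value.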
10293         BasicBlock *PBB = cast<Instruction>(ResumeV)->getParent();
10294         IRBuilder<> Builder(PBB, PBB->getFirstNonPHIIt());
10295         ResumeV =
10296             Builder.CreateICmpNE(ResumeV, RdxDesc.getRecurrenceStartValue());
10297       } else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) {
10298         // VPReductionPHIRecipe for FindLastIV reductions requires an adjustment
10299         // to the resume value. The resume value is adjusted to the sentinel
10300         // value when the final value from the main vector loop equals the start
10301         // value. This ensures correctness when the start value might not be
10302         // less than the minimum value of a monotonically increasing induction
10303         // variable.
10304         BasicBlock *ResumeBB = cast<Instruction>(ResumeV)->getParent();
10305         IRBuilder<> Builder(ResumeBB, ResumeBB->getFirstNonPHIIt());
10306         Value *Cmp =
10307             Builder.CreateICmpEQ(ResumeV, RdxDesc.getRecurrenceStartValue());
10308         ResumeV =
10309             Builder.CreateSelect(Cmp, RdxDesc.getSentinelValue(), ResumeV);
10310       }
10311     } else {
10312       // Retrieve the induction resume values for wide inductions from
10313       // their original phi nodes in the scalar loop.
10314       PHINode *IndPhi = cast<VPWidenInductionRecipe>(&R)->getPHINode();
10315       // Hook up to the PHINode generated by a ResumePhi recipe of main
10316       // loop VPlan, which feeds the scalar loop.
10317       ResumeV = IndPhi->getIncomingValueForBlock(L->getLoopPreheader());
10318     }
10319     assert(ResumeV && "Must have a resume value");
10320     VPValue *StartVal = Plan.getOrAddLiveIn(ResumeV);
10321     cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
10322   }
10323 }
10324 
10325 bool LoopVectorizePass::processLoop(Loop *L) {
10326   assert((EnableVPlanNativePath || L->isInnermost()) &&
10327          "VPlan-native path is not enabled. Only process inner loops.");
10328 
10329   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
10330                     << L->getHeader()->getParent()->getName() << "' from "
10331                     << L->getLocStr() << "\n");
10332 
10333   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
10334 
10335   LLVM_DEBUG(
10336       dbgs() << "LV: Loop hints:"
10337              << " force="
10338              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
10339                      ? "disabled"
10340                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
10341                             ? "enabled"
10342                             : "?"))
10343              << " width=" << Hints.getWidth()
10344              << " interleave=" << Hints.getInterleave() << "\n");
10345 
10346   // Function containing loop
10347   Function *F = L->getHeader()->getParent();
10348 
10349   // Looking at the diagnostic output is the only way to determine if a loop
10350   // was vectorized (other than looking at the IR or machine code), so it
10351   // is important to generate an optimization remark for each loop. Most of
10352   // these messages are generated as OptimizationRemarkAnalysis. Remarks
10353   // generated as OptimizationRemark and OptimizationRemarkMissed are
10354   // less verbose, reporting vectorized loops and unvectorized loops that may
10355   // benefit from vectorization, respectively.
10356 
10357   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
10358     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
10359     return false;
10360   }
10361 
10362   PredicatedScalarEvolution PSE(*SE, *L);
10363 
10364   // Check if it is legal to vectorize the loop.
10365   LoopVectorizationRequirements Requirements;
10366   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE,
10367                                 &Requirements, &Hints, DB, AC, BFI, PSI);
10368   if (!LVL.canVectorize(EnableVPlanNativePath)) {
10369     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
10370     Hints.emitRemarkWithHints();
10371     return false;
10372   }
10373 
10374   if (LVL.hasUncountableEarlyExit() && !EnableEarlyExitVectorization) {
10375     reportVectorizationFailure("Auto-vectorization of loops with uncountable "
10376                                "early exit is not enabled",
10377                                "UncountableEarlyExitLoopsDisabled", ORE, L);
10378     return false;
10379   }
10380 
10381   if (LVL.hasStructVectorCall()) {
10382     reportVectorizationFailure("Auto-vectorization of calls that return struct "
10383                                "types is not yet supported",
10384                                "StructCallVectorizationUnsupported", ORE, L);
10385     return false;
10386   }
10387 
10388   // Entrance to the VPlan-native vectorization path. Outer loops are processed
10389   // here. They may require CFG and instruction level transformations before
10390   // even evaluating whether vectorization is profitable. Since we cannot modify
10391   // the incoming IR, we need to build VPlan upfront in the vectorization
10392   // pipeline.
10393   if (!L->isInnermost())
10394     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
10395                                         ORE, BFI, PSI, Hints, Requirements);
10396 
10397   assert(L->isInnermost() && "Inner loop expected.");
10398 
10399   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
10400   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
10401 
10402   // If an override option has been passed in for interleaved accesses, use it.
10403   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
10404     UseInterleaved = EnableInterleavedMemAccesses;
10405 
10406   // Analyze interleaved memory accesses.
10407   if (UseInterleaved)
10408     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
10409 
10410   if (LVL.hasUncountableEarlyExit()) {
10411     BasicBlock *LoopLatch = L->getLoopLatch();
10412     if (IAI.requiresScalarEpilogue() ||
10413         any_of(LVL.getCountableExitingBlocks(),
10414                [LoopLatch](BasicBlock *BB) { return BB != LoopLatch; })) {
10415       reportVectorizationFailure("Auto-vectorization of early exit loops "
10416                                  "requiring a scalar epilogue is unsupported",
10417                                  "UncountableEarlyExitUnsupported", ORE, L);
10418       return false;
10419     }
10420   }
10421 
10422   // Check the function attributes and profiles to find out if this function
10423   // should be optimized for size.
10424   ScalarEpilogueLowering SEL =
10425       getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL, &IAI);
10426 
10427   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
10428   // count by optimizing for size, to minimize overheads.
10429   auto ExpectedTC = getSmallBestKnownTC(PSE, L);
10430   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
10431     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
10432                       << "This loop is worth vectorizing only if no scalar "
10433                       << "iteration overheads are incurred.");
10434     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
10435       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
10436     else {
10437       if (*ExpectedTC > TTI->getMinTripCountTailFoldingThreshold()) {
10438         LLVM_DEBUG(dbgs() << "\n");
10439         // Predicated tail-folded loops are efficient even when the loop
10440         // iteration count is low. However, setting the epilogue policy to
10441         // `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops
10442         // with runtime checks. It's more effective to let
10443         // `areRuntimeChecksProfitable` determine if vectorization is beneficial
10444         // for the loop.
10445         if (SEL != CM_ScalarEpilogueNotNeededUsePredicate)
10446           SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
10447       } else {
10448         LLVM_DEBUG(dbgs() << " But the target considers the trip count too "
10449                              "small to consider vectorizing.\n");
10450         reportVectorizationFailure(
10451             "The trip count is below the minimal threshold value.",
10452             "loop trip count is too low, avoiding vectorization",
10453             "LowTripCount", ORE, L);
10454         Hints.emitRemarkWithHints();
10455         return false;
10456       }
10457     }
10458   }
10459 
10460   // Check the function attributes to see if implicit floats or vectors are
10461   // allowed.
10462   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
10463     reportVectorizationFailure(
10464         "Can't vectorize when the NoImplicitFloat attribute is used",
10465         "loop not vectorized due to NoImplicitFloat attribute",
10466         "NoImplicitFloat", ORE, L);
10467     Hints.emitRemarkWithHints();
10468     return false;
10469   }
10470 
10471   // Check if the target supports potentially unsafe FP vectorization.
10472   // FIXME: Add a check for the type of safety issue (denormal, signaling)
10473   // for the target we're vectorizing for, to make sure none of the
10474   // additional fp-math flags can help.
10475   if (Hints.isPotentiallyUnsafe() &&
10476       TTI->isFPVectorizationPotentiallyUnsafe()) {
10477     reportVectorizationFailure(
10478         "Potentially unsafe FP op prevents vectorization",
10479         "loop not vectorized due to unsafe FP support.",
10480         "UnsafeFP", ORE, L);
10481     Hints.emitRemarkWithHints();
10482     return false;
10483   }
10484 
10485   bool AllowOrderedReductions;
10486   // If the flag is set, use that instead and override the TTI behaviour.
10487   if (ForceOrderedReductions.getNumOccurrences() > 0)
10488     AllowOrderedReductions = ForceOrderedReductions;
10489   else
10490     AllowOrderedReductions = TTI->enableOrderedReductions();
10491   if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) {
10492     ORE->emit([&]() {
10493       auto *ExactFPMathInst = Requirements.getExactFPInst();
10494       return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
10495                                                  ExactFPMathInst->getDebugLoc(),
10496                                                  ExactFPMathInst->getParent())
10497              << "loop not vectorized: cannot prove it is safe to reorder "
10498                 "floating-point operations";
10499     });
10500     LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
10501                          "reorder floating-point operations\n");
10502     Hints.emitRemarkWithHints();
10503     return false;
10504   }
10505 
10506   // Use the cost model.
10507   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
10508                                 F, &Hints, IAI);
10509   // Use the planner for vectorization.
10510   LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
10511                                ORE);
10512 
10513   // Get user vectorization factor and interleave count.
10514   ElementCount UserVF = Hints.getWidth();
10515   unsigned UserIC = Hints.getInterleave();
10516 
10517   // Plan how to best vectorize.
10518   LVP.plan(UserVF, UserIC);
10519   VectorizationFactor VF = LVP.computeBestVF();
10520   unsigned IC = 1;
10521 
10522   if (ORE->allowExtraAnalysis(LV_NAME))
10523     LVP.emitInvalidCostRemarks(ORE);
10524 
10525   bool AddBranchWeights =
10526       hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
10527   GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(),
10528                            AddBranchWeights, CM.CostKind);
10529   if (LVP.hasPlanWithVF(VF.Width)) {
10530     // Select the interleave count.
10531     IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
10532 
10533     unsigned SelectedIC = std::max(IC, UserIC);
10534     // Optimistically generate runtime checks if they are needed. Drop them if
10535     // they turn out not to be profitable.
10536     if (VF.Width.isVector() || SelectedIC > 1)
10537       Checks.create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
10538 
10539     // Check if it is profitable to vectorize with runtime checks.
10540     bool ForceVectorization =
10541         Hints.getForce() == LoopVectorizeHints::FK_Enabled;
10542     if (!ForceVectorization &&
10543         !areRuntimeChecksProfitable(Checks, VF, L, *TTI, PSE, SEL)) {
10544       ORE->emit([&]() {
10545         return OptimizationRemarkAnalysisAliasing(
10546                    DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
10547                    L->getHeader())
10548                << "loop not vectorized: cannot prove it is safe to reorder "
10549                   "memory operations";
10550       });
10551       LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
10552       Hints.emitRemarkWithHints();
10553       return false;
10554     }
10555   }
10556 
10557   // Identify the diagnostic messages that should be produced.
10558   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
10559   bool VectorizeLoop = true, InterleaveLoop = true;
10560   if (VF.Width.isScalar()) {
10561     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
10562     VecDiagMsg = std::make_pair(
10563         "VectorizationNotBeneficial",
10564         "the cost-model indicates that vectorization is not beneficial");
10565     VectorizeLoop = false;
10566   }
10567 
10568   if (!LVP.hasPlanWithVF(VF.Width) && UserIC > 1) {
10569     // Tell the user interleaving was avoided up-front, despite being explicitly
10570     // requested.
10571     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
10572                          "interleaving should be avoided up front\n");
10573     IntDiagMsg = std::make_pair(
10574         "InterleavingAvoided",
10575         "Ignoring UserIC, because interleaving was avoided up front");
10576     InterleaveLoop = false;
10577   } else if (IC == 1 && UserIC <= 1) {
10578     // Tell the user interleaving is not beneficial.
10579     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
10580     IntDiagMsg = std::make_pair(
10581         "InterleavingNotBeneficial",
10582         "the cost-model indicates that interleaving is not beneficial");
10583     InterleaveLoop = false;
10584     if (UserIC == 1) {
10585       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
10586       IntDiagMsg.second +=
10587           " and is explicitly disabled or interleave count is set to 1";
10588     }
10589   } else if (IC > 1 && UserIC == 1) {
10590     // Tell the user interleaving is beneficial, but it is explicitly disabled.
10591     LLVM_DEBUG(
10592         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
10593     IntDiagMsg = std::make_pair(
10594         "InterleavingBeneficialButDisabled",
10595         "the cost-model indicates that interleaving is beneficial "
10596         "but is explicitly disabled or interleave count is set to 1");
10597     InterleaveLoop = false;
10598   }
10599 
10600   // If there is a histogram in the loop, do not just interleave without
10601   // vectorizing. The order of operations will be incorrect without the
10602   // histogram intrinsics, which are only used for recipes with VF > 1.
10603   if (!VectorizeLoop && InterleaveLoop && LVL.hasHistograms()) {
10604     LLVM_DEBUG(dbgs() << "LV: Not interleaving without vectorization due "
10605                       << "to histogram operations.\n");
10606     IntDiagMsg = std::make_pair(
10607         "HistogramPreventsScalarInterleaving",
10608         "Unable to interleave without vectorization due to constraints on "
10609         "the order of histogram operations");
10610     InterleaveLoop = false;
10611   }
10612 
10613   // Override IC if user provided an interleave count.
10614   IC = UserIC > 0 ? UserIC : IC;
10615 
10616   // Emit diagnostic messages, if any.
10617   const char *VAPassName = Hints.vectorizeAnalysisPassName();
10618   if (!VectorizeLoop && !InterleaveLoop) {
10619     // Do not vectorize or interleave the loop.
10620     ORE->emit([&]() {
10621       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
10622                                       L->getStartLoc(), L->getHeader())
10623              << VecDiagMsg.second;
10624     });
10625     ORE->emit([&]() {
10626       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
10627                                       L->getStartLoc(), L->getHeader())
10628              << IntDiagMsg.second;
10629     });
10630     return false;
10631   }
10632 
10633   if (!VectorizeLoop && InterleaveLoop) {
10634     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10635     ORE->emit([&]() {
10636       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
10637                                         L->getStartLoc(), L->getHeader())
10638              << VecDiagMsg.second;
10639     });
10640   } else if (VectorizeLoop && !InterleaveLoop) {
10641     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10642                       << ") in " << L->getLocStr() << '\n');
10643     ORE->emit([&]() {
10644       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
10645                                         L->getStartLoc(), L->getHeader())
10646              << IntDiagMsg.second;
10647     });
10648   } else if (VectorizeLoop && InterleaveLoop) {
10649     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10650                       << ") in " << L->getLocStr() << '\n');
10651     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10652   }
10653 
10654   bool DisableRuntimeUnroll = false;
10655   MDNode *OrigLoopID = L->getLoopID();
10656   {
10657     using namespace ore;
10658     if (!VectorizeLoop) {
10659       assert(IC > 1 && "interleave count should not be 1 or 0");
10660       // If we decided that it is not legal to vectorize the loop, then
10661       // interleave it.
10662       VPlan &BestPlan = LVP.getPlanFor(VF.Width);
10663       InnerLoopVectorizer Unroller(
10664           L, PSE, LI, DT, TLI, TTI, AC, ORE, ElementCount::getFixed(1),
10665           ElementCount::getFixed(1), IC, &LVL, &CM, BFI, PSI, Checks, BestPlan);
10666 
10667       LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);
10668 
10669       ORE->emit([&]() {
10670         return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
10671                                   L->getHeader())
10672                << "interleaved loop (interleave count: "
10673                << NV("InterleaveCount", IC) << ")";
10674       });
10675     } else {
10676       // If we decided that it is *legal* to vectorize the loop, then do it.
10677 
10678       VPlan &BestPlan = LVP.getPlanFor(VF.Width);
10679       // Consider vectorizing the epilogue too if it's profitable.
10680       VectorizationFactor EpilogueVF =
10681           LVP.selectEpilogueVectorizationFactor(VF.Width, IC);
10682       if (EpilogueVF.Width.isVector()) {
10683         std::unique_ptr<VPlan> BestMainPlan(BestPlan.duplicate());
10684 
10685         // The first pass vectorizes the main loop and creates a scalar epilogue
10686         // to be vectorized by executing the plan (potentially with a different
10687         // factor) again shortly afterwards.
10688         VPlan &BestEpiPlan = LVP.getPlanFor(EpilogueVF.Width);
10689         preparePlanForMainVectorLoop(*BestMainPlan, BestEpiPlan);
10690         EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1,
10691                                           BestEpiPlan);
10692         EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
10693                                            EPI, &LVL, &CM, BFI, PSI, Checks,
10694                                            *BestMainPlan);
10695         auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF,
10696                                              *BestMainPlan, MainILV, DT, false);
10697         ++LoopsVectorized;
10698 
10699         // Second pass vectorizes the epilogue and adjusts the control flow
10700         // edges from the first pass.
10701         EPI.MainLoopVF = EPI.EpilogueVF;
10702         EPI.MainLoopUF = EPI.EpilogueUF;
10703         EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
10704                                                  ORE, EPI, &LVL, &CM, BFI, PSI,
10705                                                  Checks, BestEpiPlan);
10706         EpilogILV.setTripCount(MainILV.getTripCount());
10707         preparePlanForEpilogueVectorLoop(BestEpiPlan, L, ExpandedSCEVs, EPI);
10708 
10709         LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
10710                         DT, true, &ExpandedSCEVs);
10711         ++LoopsEpilogueVectorized;
10712 
10713         if (!MainILV.areSafetyChecksAdded())
10714           DisableRuntimeUnroll = true;
10715       } else {
10716         InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
10717                                VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
10718                                PSI, Checks, BestPlan);
10719         LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
10720         ++LoopsVectorized;
10721 
10722         // Add metadata to disable runtime unrolling of a scalar loop when there
10723         // are no runtime checks about strides and memory. A scalar loop that is
10724         // rarely used is not worth unrolling.
10725         if (!LB.areSafetyChecksAdded())
10726           DisableRuntimeUnroll = true;
10727       }
10728       // Report the vectorization decision.
10729       reportVectorization(ORE, L, VF, IC);
10730     }
10731 
10732     if (ORE->allowExtraAnalysis(LV_NAME))
10733       checkMixedPrecision(L, ORE);
10734   }
10735 
10736   assert(DT->verify(DominatorTree::VerificationLevel::Fast) &&
10737          "DT not preserved correctly");
10738 
10739   std::optional<MDNode *> RemainderLoopID =
10740       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
10741                                       LLVMLoopVectorizeFollowupEpilogue});
10742   if (RemainderLoopID) {
10743     L->setLoopID(*RemainderLoopID);
10744   } else {
10745     if (DisableRuntimeUnroll)
10746       addRuntimeUnrollDisableMetaData(L);
10747 
10748     // Mark the loop as already vectorized to avoid vectorizing again.
10749     Hints.setAlreadyVectorized();
10750   }
10751 
10752   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10753   return true;
10754 }
10755 
10756 LoopVectorizeResult LoopVectorizePass::runImpl(Function &F) {
10757 
10758   // Don't attempt if
10759   // 1. the target claims to have no vector registers, and
10760   // 2. interleaving won't help ILP.
10761   //
10762   // The second condition is necessary because, even if the target has no
10763   // vector registers, loop vectorization may still enable scalar
10764   // interleaving.
10765   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
10766       TTI->getMaxInterleaveFactor(ElementCount::getFixed(1)) < 2)
10767     return LoopVectorizeResult(false, false);
10768 
10769   bool Changed = false, CFGChanged = false;
10770 
10771   // The vectorizer requires loops to be in simplified form.
10772   // Since simplification may add new inner loops, it has to run before the
10773   // legality and profitability checks. This means running the loop vectorizer
10774   // will simplify all loops, regardless of whether anything ends up being
10775   // vectorized.
10776   for (const auto &L : *LI)
10777     Changed |= CFGChanged |=
10778         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10779 
10780   // Build up a worklist of inner loops to vectorize. This is necessary
10781   // because vectorizing or partially unrolling a loop creates new loops and
10782   // can invalidate iterators across the loops.
10783   SmallVector<Loop *, 8> Worklist;
10784 
10785   for (Loop *L : *LI)
10786     collectSupportedLoops(*L, LI, ORE, Worklist);
10787 
10788   LoopsAnalyzed += Worklist.size();
10789 
10790   // Now walk the identified inner loops.
10791   while (!Worklist.empty()) {
10792     Loop *L = Worklist.pop_back_val();
10793 
10794     // For the inner loops we actually process, form LCSSA to simplify the
10795     // transform.
10796     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
10797 
10798     Changed |= CFGChanged |= processLoop(L);
10799 
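          // Once something changed, cached LoopAccessInfo may refer to deleted
          // or rewritten loops, so drop it; optionally verify SCEV in asserts
          // builds.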
10800     if (Changed) {
10801       LAIs->clear();
10802 
10803 #ifndef NDEBUG
10804       if (VerifySCEV)
10805         SE->verify();
10806 #endif
10807     }
10808   }
10809 
10810   // Report whether anything changed and whether the CFG was modified.
10811   return LoopVectorizeResult(Changed, CFGChanged);
10812 }
10813 
10814 PreservedAnalyses LoopVectorizePass::run(Function &F,
10815                                          FunctionAnalysisManager &AM) {
10816   LI = &AM.getResult<LoopAnalysis>(F);
10817   // If there are no loops in the function, return early before computing
10818   // other expensive analyses.
10819   if (LI->empty())
10820     return PreservedAnalyses::all();
10821   SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
10822   TTI = &AM.getResult<TargetIRAnalysis>(F);
10823   DT = &AM.getResult<DominatorTreeAnalysis>(F);
10824   TLI = &AM.getResult<TargetLibraryAnalysis>(F);
10825   AC = &AM.getResult<AssumptionAnalysis>(F);
10826   DB = &AM.getResult<DemandedBitsAnalysis>(F);
10827   ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
10828   LAIs = &AM.getResult<LoopAccessAnalysis>(F);
10829 
10830   auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
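        // Profile data: reuse the module-level profile summary if it has
        // already been computed, and only request BFI when a profile is
        // actually present.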
10831   PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
10832   BFI = nullptr;
10833   if (PSI && PSI->hasProfileSummary())
10834     BFI = &AM.getResult<BlockFrequencyAnalysis>(F);
10835   LoopVectorizeResult Result = runImpl(F);
10836   if (!Result.MadeAnyChange)
10837     return PreservedAnalyses::all();
10838   PreservedAnalyses PA;
10839 
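        // When assignment tracking is enabled, sweep every block for redundant
        // debug records that the transformation may have left behind.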
10840   if (isAssignmentTrackingEnabled(*F.getParent())) {
10841     for (auto &BB : F)
10842       RemoveRedundantDbgInstrs(&BB);
10843   }
10844 
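        // LoopInfo, the dominator tree, SCEV and LoopAccessInfo are maintained
        // by the code above (stale LoopAccessInfo was already cleared in
        // runImpl), so they can be declared preserved.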
10845   PA.preserve<LoopAnalysis>();
10846   PA.preserve<DominatorTreeAnalysis>();
10847   PA.preserve<ScalarEvolutionAnalysis>();
10848   PA.preserve<LoopAccessAnalysis>();
10849 
10850   if (Result.MadeCFGChange) {
10851     // Making CFG changes likely means a loop got vectorized. Indicate that
10852     // extra simplification passes should be run.
10853     // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
10854     // be run if runtime checks have been added.
10855     AM.getResult<ShouldRunExtraVectorPasses>(F);
10856     PA.preserve<ShouldRunExtraVectorPasses>();
10857   } else {
10858     PA.preserveSet<CFGAnalyses>();
10859   }
10860   return PA;
10861 }
10862 
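      // A note on the printed form: this emits the parameter block that follows
      // the pass name in a textual pipeline, e.g. (with the registered name and
      // default options, shown for illustration)
      //   loop-vectorize<no-interleave-forced-only;no-vectorize-forced-only>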
10863 void LoopVectorizePass::printPipeline(
10864     raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
10865   static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
10866       OS, MapClassName2PassName);
10867 
10868   OS << '<';
10869   OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
10870   OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
10871   OS << '>';
10872 }
10873