1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
17 //
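// For example (an illustrative sketch, independent of any target): a scalar
// loop such as
//
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + c[i];
//
// is rewritten so that each iteration of the vector loop processes VF
// elements at once (here VF = 4):
//
//   for (int i = 0; i + 4 <= n; i += 4)
//     a[i..i+3] = b[i..i+3] + c[i..i+3];   // one 'wide' iteration
//
// with the remaining iterations handled by a scalar epilogue loop, or folded
// into the vector body using predication (tail folding).
//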
18 // This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
28 // There is an ongoing development effort to migrate the loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanAnalysis.h"
61 #include "VPlanHCFGBuilder.h"
62 #include "VPlanPatternMatch.h"
63 #include "VPlanTransforms.h"
64 #include "VPlanUtils.h"
65 #include "VPlanVerifier.h"
66 #include "llvm/ADT/APInt.h"
67 #include "llvm/ADT/ArrayRef.h"
68 #include "llvm/ADT/DenseMap.h"
69 #include "llvm/ADT/DenseMapInfo.h"
70 #include "llvm/ADT/Hashing.h"
71 #include "llvm/ADT/MapVector.h"
72 #include "llvm/ADT/STLExtras.h"
73 #include "llvm/ADT/SmallPtrSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/TypeSwitch.h"
79 #include "llvm/ADT/iterator_range.h"
80 #include "llvm/Analysis/AssumptionCache.h"
81 #include "llvm/Analysis/BasicAliasAnalysis.h"
82 #include "llvm/Analysis/BlockFrequencyInfo.h"
83 #include "llvm/Analysis/CFG.h"
84 #include "llvm/Analysis/CodeMetrics.h"
85 #include "llvm/Analysis/DemandedBits.h"
86 #include "llvm/Analysis/GlobalsModRef.h"
87 #include "llvm/Analysis/LoopAccessAnalysis.h"
88 #include "llvm/Analysis/LoopAnalysisManager.h"
89 #include "llvm/Analysis/LoopInfo.h"
90 #include "llvm/Analysis/LoopIterator.h"
91 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
92 #include "llvm/Analysis/ProfileSummaryInfo.h"
93 #include "llvm/Analysis/ScalarEvolution.h"
94 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
95 #include "llvm/Analysis/TargetLibraryInfo.h"
96 #include "llvm/Analysis/TargetTransformInfo.h"
97 #include "llvm/Analysis/ValueTracking.h"
98 #include "llvm/Analysis/VectorUtils.h"
99 #include "llvm/IR/Attributes.h"
100 #include "llvm/IR/BasicBlock.h"
101 #include "llvm/IR/CFG.h"
102 #include "llvm/IR/Constant.h"
103 #include "llvm/IR/Constants.h"
104 #include "llvm/IR/DataLayout.h"
105 #include "llvm/IR/DebugInfo.h"
106 #include "llvm/IR/DebugLoc.h"
107 #include "llvm/IR/DerivedTypes.h"
108 #include "llvm/IR/DiagnosticInfo.h"
109 #include "llvm/IR/Dominators.h"
110 #include "llvm/IR/Function.h"
111 #include "llvm/IR/IRBuilder.h"
112 #include "llvm/IR/InstrTypes.h"
113 #include "llvm/IR/Instruction.h"
114 #include "llvm/IR/Instructions.h"
115 #include "llvm/IR/IntrinsicInst.h"
116 #include "llvm/IR/Intrinsics.h"
117 #include "llvm/IR/MDBuilder.h"
118 #include "llvm/IR/Metadata.h"
119 #include "llvm/IR/Module.h"
120 #include "llvm/IR/Operator.h"
121 #include "llvm/IR/PatternMatch.h"
122 #include "llvm/IR/ProfDataUtils.h"
123 #include "llvm/IR/Type.h"
124 #include "llvm/IR/Use.h"
125 #include "llvm/IR/User.h"
126 #include "llvm/IR/Value.h"
127 #include "llvm/IR/Verifier.h"
128 #include "llvm/Support/Casting.h"
129 #include "llvm/Support/CommandLine.h"
130 #include "llvm/Support/Debug.h"
131 #include "llvm/Support/ErrorHandling.h"
132 #include "llvm/Support/InstructionCost.h"
133 #include "llvm/Support/MathExtras.h"
134 #include "llvm/Support/NativeFormatting.h"
135 #include "llvm/Support/raw_ostream.h"
136 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
137 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
138 #include "llvm/Transforms/Utils/Local.h"
139 #include "llvm/Transforms/Utils/LoopSimplify.h"
140 #include "llvm/Transforms/Utils/LoopUtils.h"
141 #include "llvm/Transforms/Utils/LoopVersioning.h"
142 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
143 #include "llvm/Transforms/Utils/SizeOpts.h"
144 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
145 #include <algorithm>
146 #include <cassert>
147 #include <cstdint>
148 #include <functional>
149 #include <iterator>
150 #include <limits>
151 #include <memory>
152 #include <string>
153 #include <tuple>
154 #include <utility>
155 
156 using namespace llvm;
157 
158 #define LV_NAME "loop-vectorize"
159 #define DEBUG_TYPE LV_NAME
160 
161 #ifndef NDEBUG
162 const char VerboseDebug[] = DEBUG_TYPE "-verbose";
163 #endif
164 
165 /// @{
166 /// Metadata attribute names
167 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
168 const char LLVMLoopVectorizeFollowupVectorized[] =
169     "llvm.loop.vectorize.followup_vectorized";
170 const char LLVMLoopVectorizeFollowupEpilogue[] =
171     "llvm.loop.vectorize.followup_epilogue";
172 /// @}
173 
174 STATISTIC(LoopsVectorized, "Number of loops vectorized");
175 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
176 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
177 
178 static cl::opt<bool> EnableEpilogueVectorization(
179     "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
180     cl::desc("Enable vectorization of epilogue loops."));
181 
182 static cl::opt<unsigned> EpilogueVectorizationForceVF(
183     "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
184     cl::desc("When epilogue vectorization is enabled, and a value greater than "
185              "1 is specified, forces the given VF for all applicable epilogue "
186              "loops."));
187 
188 static cl::opt<unsigned> EpilogueVectorizationMinVF(
189     "epilogue-vectorization-minimum-VF", cl::Hidden,
190     cl::desc("Only loops with vectorization factor equal to or larger than "
191              "the specified value are considered for epilogue vectorization."));
192 
193 /// Loops with a known constant trip count below this number are vectorized only
194 /// if no scalar iteration overheads are incurred.
195 static cl::opt<unsigned> TinyTripCountVectorThreshold(
196     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
197     cl::desc("Loops with a constant trip count that is smaller than this "
198              "value are vectorized only if no scalar iteration overheads "
199              "are incurred."));
200 
201 static cl::opt<unsigned> VectorizeMemoryCheckThreshold(
202     "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
203     cl::desc("The maximum allowed number of runtime memory checks"));
204 
205 // The option prefer-predicate-over-epilogue indicates that an epilogue is
206 // undesired and that predication is preferred; its values list the available
207 // strategies. I.e., the vectorizer will try to fold the tail loop (epilogue)
208 // into the vector body and predicate the instructions accordingly. If
209 // tail-folding fails, the fallback strategy depends on the chosen value:
210 namespace PreferPredicateTy {
211   enum Option {
212     ScalarEpilogue = 0,
213     PredicateElseScalarEpilogue,
214     PredicateOrDontVectorize
215   };
216 } // namespace PreferPredicateTy
217 
218 static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
219     "prefer-predicate-over-epilogue",
220     cl::init(PreferPredicateTy::ScalarEpilogue),
221     cl::Hidden,
222     cl::desc("Tail-folding and predication preferences over creating a scalar "
223              "epilogue loop."),
224     cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
225                          "scalar-epilogue",
226                          "Don't tail-predicate loops, create scalar epilogue"),
227               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
228                          "predicate-else-scalar-epilogue",
229                          "prefer tail-folding, create scalar epilogue if tail "
230                          "folding fails."),
231               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
232                          "predicate-dont-vectorize",
233                          "prefer tail-folding, don't attempt vectorization if "
234                          "tail-folding fails.")));
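// For example, tail folding can be requested with an invocation along the
// lines of (illustrative):
//   opt -passes=loop-vectorize \
//       -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue ...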
235 
236 static cl::opt<TailFoldingStyle> ForceTailFoldingStyle(
237     "force-tail-folding-style", cl::desc("Force the tail folding style"),
238     cl::init(TailFoldingStyle::None),
239     cl::values(
240         clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"),
241         clEnumValN(
242             TailFoldingStyle::Data, "data",
243             "Create lane mask for data only, using active.lane.mask intrinsic"),
244         clEnumValN(TailFoldingStyle::DataWithoutLaneMask,
245                    "data-without-lane-mask",
246                    "Create lane mask with compare/stepvector"),
247         clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control",
248                    "Create lane mask using active.lane.mask intrinsic, and use "
249                    "it for both data and control flow"),
250         clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
251                    "data-and-control-without-rt-check",
252                    "Similar to data-and-control, but remove the runtime check"),
253         clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl",
254                    "Use predicated EVL instructions for tail folding. If EVL "
255                    "is unsupported, fall back to data-without-lane-mask.")));
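// For illustration (assuming VF = 4 and an i64 induction), the "data" styles
// above materialize a lane mask roughly like
//   %mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 %iv, i64 %n)
// whereas "data-without-lane-mask" builds the equivalent mask from a
// step-vector and a compare.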
256 
257 static cl::opt<bool> MaximizeBandwidth(
258     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
259     cl::desc("Maximize bandwidth when selecting the vectorization factor, "
260              "which will be determined by the smallest type in the loop."));
261 
262 static cl::opt<bool> EnableInterleavedMemAccesses(
263     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
264     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
265 
266 /// An interleave-group may need masking if it resides in a block that needs
267 /// predication, or in order to mask away gaps.
268 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
269     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
270     cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
271 
272 static cl::opt<unsigned> ForceTargetNumScalarRegs(
273     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
274     cl::desc("A flag that overrides the target's number of scalar registers."));
275 
276 static cl::opt<unsigned> ForceTargetNumVectorRegs(
277     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
278     cl::desc("A flag that overrides the target's number of vector registers."));
279 
280 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
281     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
282     cl::desc("A flag that overrides the target's max interleave factor for "
283              "scalar loops."));
284 
285 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
286     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
287     cl::desc("A flag that overrides the target's max interleave factor for "
288              "vectorized loops."));
289 
290 cl::opt<unsigned> ForceTargetInstructionCost(
291     "force-target-instruction-cost", cl::init(0), cl::Hidden,
292     cl::desc("A flag that overrides the target's expected cost for "
293              "an instruction to a single constant value. Mostly "
294              "useful for getting consistent testing."));
295 
296 static cl::opt<bool> ForceTargetSupportsScalableVectors(
297     "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
298     cl::desc(
299         "Pretend that scalable vectors are supported, even if the target does "
300         "not support them. This flag should only be used for testing."));
301 
302 static cl::opt<unsigned> SmallLoopCost(
303     "small-loop-cost", cl::init(20), cl::Hidden,
304     cl::desc(
305         "The cost of a loop that is considered 'small' by the interleaver."));
306 
307 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
308     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
309     cl::desc("Enable the use of the block frequency analysis to access PGO "
310              "heuristics minimizing code growth in cold regions and being more "
311              "aggressive in hot regions."));
312 
313 // Runtime interleave loops for load/store throughput.
314 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
315     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
316     cl::desc(
317         "Enable runtime interleaving until load/store ports are saturated"));
318 
319 /// The number of stores in a loop that are allowed to need predication.
320 static cl::opt<unsigned> NumberOfStoresToPredicate(
321     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
322     cl::desc("Max number of stores to be predicated behind an if."));
323 
324 static cl::opt<bool> EnableIndVarRegisterHeur(
325     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
326     cl::desc("Count the induction variable only once when interleaving"));
327 
328 static cl::opt<bool> EnableCondStoresVectorization(
329     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
330     cl::desc("Enable if-predication of stores during vectorization."));
331 
332 static cl::opt<unsigned> MaxNestedScalarReductionIC(
333     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
334     cl::desc("The maximum interleave count to use when interleaving a scalar "
335              "reduction in a nested loop."));
336 
337 static cl::opt<bool>
338     PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
339                            cl::Hidden,
340                            cl::desc("Prefer in-loop vector reductions, "
341                                     "overriding the target's preference."));
342 
343 static cl::opt<bool> ForceOrderedReductions(
344     "force-ordered-reductions", cl::init(false), cl::Hidden,
345     cl::desc("Enable the vectorization of loops with in-order (strict) "
346              "FP reductions"));
347 
348 static cl::opt<bool> PreferPredicatedReductionSelect(
349     "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
350     cl::desc(
351         "Prefer predicating a reduction operation over an after loop select."));
352 
353 namespace llvm {
354 cl::opt<bool> EnableVPlanNativePath(
355     "enable-vplan-native-path", cl::Hidden,
356     cl::desc("Enable VPlan-native vectorization path with "
357              "support for outer loop vectorization."));
358 } // namespace llvm
359 
360 // This flag enables the stress testing of the VPlan H-CFG construction in the
361 // VPlan-native vectorization path. It must be used in conjunction with
362 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
363 // verification of the H-CFGs built.
364 static cl::opt<bool> VPlanBuildStressTest(
365     "vplan-build-stress-test", cl::init(false), cl::Hidden,
366     cl::desc(
367         "Build VPlan for every supported loop nest in the function and bail "
368         "out right after the build (stress test the VPlan H-CFG construction "
369         "in the VPlan-native vectorization path)."));
370 
371 cl::opt<bool> llvm::EnableLoopInterleaving(
372     "interleave-loops", cl::init(true), cl::Hidden,
373     cl::desc("Enable loop interleaving in Loop vectorization passes"));
374 cl::opt<bool> llvm::EnableLoopVectorization(
375     "vectorize-loops", cl::init(true), cl::Hidden,
376     cl::desc("Run the Loop vectorization passes"));
377 
378 static cl::opt<cl::boolOrDefault> ForceSafeDivisor(
379     "force-widen-divrem-via-safe-divisor", cl::Hidden,
380     cl::desc(
381         "Override cost based safe divisor widening for div/rem instructions"));
382 
383 static cl::opt<bool> UseWiderVFIfCallVariantsPresent(
384     "vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true),
385     cl::Hidden,
386     cl::desc("Try wider VFs if they enable the use of vector variants"));
387 
388 static cl::opt<bool> EnableEarlyExitVectorization(
389     "enable-early-exit-vectorization", cl::init(false), cl::Hidden,
390     cl::desc(
391         "Enable vectorization of early exit loops with uncountable exits."));
392 
393 // Likelihood of bypassing the vectorized loop because assumptions about SCEV
394 // variables not overflowing do not hold. See `emitSCEVChecks`.
395 static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127};
396 // Likelihood of bypassing the vectorized loop because pointers overlap. See
397 // `emitMemRuntimeChecks`.
398 static constexpr uint32_t MemCheckBypassWeights[] = {1, 127};
399 // Likelihood of bypassing the vectorized loop because there are zero trips left
400 // after prolog. See `emitIterationCountCheck`.
401 static constexpr uint32_t MinItersBypassWeights[] = {1, 127};
402 
403 /// A helper function that returns true if the given type is irregular. The
404 /// type is irregular if its allocated size doesn't equal the store size of an
405 /// element of the corresponding vector type.
406 static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
407   // Determine if an array of N elements of type Ty is "bitcast compatible"
408   // with a <N x Ty> vector.
409   // This is only true if there is no padding between the array elements.
410   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
411 }
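// For example, x86_fp80 has a type size of 80 bits but an alloc size of 96 or
// 128 bits (depending on the target's alignment), so an array of x86_fp80 is
// not bitcast-compatible with <N x x86_fp80> and the type is irregular. For
// i32 both sizes are 32 bits, so i32 is regular.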
412 
413 /// Returns "best known" trip count for the specified loop \p L as defined by
414 /// the following procedure:
415 ///   1) Returns exact trip count if it is known.
416 ///   2) Returns expected trip count according to profile data if any.
417 ///   3) Returns upper bound estimate if known, and if \p CanUseConstantMax.
418 ///   4) Returns std::nullopt if all of the above failed.
419 static std::optional<unsigned>
420 getSmallBestKnownTC(PredicatedScalarEvolution &PSE, Loop *L,
421                     bool CanUseConstantMax = true) {
422   // Check if exact trip count is known.
423   if (unsigned ExpectedTC = PSE.getSE()->getSmallConstantTripCount(L))
424     return ExpectedTC;
425 
426   // Check if there is an expected trip count available from profile data.
427   if (LoopVectorizeWithBlockFrequency)
428     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
429       return *EstimatedTC;
430 
431   if (!CanUseConstantMax)
432     return std::nullopt;
433 
434   // Check if upper bound estimate is known.
435   if (unsigned ExpectedTC = PSE.getSmallConstantMaxTripCount())
436     return ExpectedTC;
437 
438   return std::nullopt;
439 }
440 
441 namespace {
442 // Forward declare GeneratedRTChecks.
443 class GeneratedRTChecks;
444 
445 using SCEV2ValueTy = DenseMap<const SCEV *, Value *>;
446 } // namespace
447 
448 namespace llvm {
449 
450 AnalysisKey ShouldRunExtraVectorPasses::Key;
451 
452 /// InnerLoopVectorizer vectorizes loops which contain only one basic
453 /// block to a specified vectorization factor (VF).
454 /// This class performs the widening of scalars into vectors, or multiple
455 /// scalars. This class also implements the following features:
456 /// * It inserts an epilogue loop for handling loops that don't have iteration
457 ///   counts that are known to be a multiple of the vectorization factor.
458 /// * It handles the code generation for reduction variables.
459 /// * Scalarization (implementation using scalars) of un-vectorizable
460 ///   instructions.
461 /// InnerLoopVectorizer does not perform any vectorization-legality
462 /// checks, and relies on the caller to check for the different legality
463 /// aspects. The InnerLoopVectorizer relies on the
464 /// LoopVectorizationLegality class to provide information about the induction
465 /// and reduction variables that were found for a given vectorization factor.
466 class InnerLoopVectorizer {
467 public:
468   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
469                       LoopInfo *LI, DominatorTree *DT,
470                       const TargetLibraryInfo *TLI,
471                       const TargetTransformInfo *TTI, AssumptionCache *AC,
472                       OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
473                       ElementCount MinProfitableTripCount,
474                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
475                       LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
476                       ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks,
477                       VPlan &Plan)
478       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
479         AC(AC), ORE(ORE), VF(VecWidth),
480         MinProfitableTripCount(MinProfitableTripCount), UF(UnrollFactor),
481         Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
482         PSI(PSI), RTChecks(RTChecks), Plan(Plan),
483         VectorPHVPB(Plan.getEntry()->getSingleSuccessor()) {
484     // Query this against the original loop and save it here because the profile
485     // of the original loop header may change as the transformation happens.
486     OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
487         OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
488   }
489 
490   virtual ~InnerLoopVectorizer() = default;
491 
492   /// Create a new empty loop that will contain vectorized instructions later
493   /// on, while the old loop will be used as the scalar remainder. Control flow
494   /// is generated around the vectorized (and scalar epilogue) loops consisting
495   /// of various checks and bypasses. Return the pre-header block of the new
496 /// loop. In the case of epilogue vectorization, this function is overridden to
497   /// handle the more complex control flow around the loops. \p ExpandedSCEVs is
498   /// used to look up SCEV expansions for expressions needed during skeleton
499   /// creation.
500   virtual BasicBlock *
501   createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs);
502 
503   /// Fix the vectorized code, taking care of header phi's, and more.
504   void fixVectorizedLoop(VPTransformState &State);
505 
506   // Return true if any runtime check is added.
507   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
508 
509   /// A helper function to scalarize a single Instruction in the innermost loop.
510   /// Generates a sequence of scalar instances for each lane between \p MinLane
511   /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
512   /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p
513   /// Instr's operands.
514   void scalarizeInstruction(const Instruction *Instr,
515                             VPReplicateRecipe *RepRecipe, const VPLane &Lane,
516                             VPTransformState &State);
517 
518   /// Fix the non-induction PHIs in \p Plan.
519   void fixNonInductionPHIs(VPTransformState &State);
520 
521   /// Returns the original loop trip count.
522   Value *getTripCount() const { return TripCount; }
523 
524   /// Used to set the trip count after ILV's construction and after the
525   /// preheader block has been executed. Note that this always holds the trip
526   /// count of the original loop for both main loop and epilogue vectorization.
527   void setTripCount(Value *TC) { TripCount = TC; }
528 
529   /// Retrieve the additional bypass value associated with an original
530   /// induction header phi.
531   Value *getInductionAdditionalBypassValue(PHINode *OrigPhi) const {
532     return Induction2AdditionalBypassValue.at(OrigPhi);
533   }
534 
535   /// Return the additional bypass block which targets the scalar loop by
536   /// skipping the epilogue loop after completing the main loop.
537   BasicBlock *getAdditionalBypassBlock() const {
538     assert(AdditionalBypassBlock &&
539            "Trying to access AdditionalBypassBlock but it has not been set");
540     return AdditionalBypassBlock;
541   }
542 
543 protected:
544   friend class LoopVectorizationPlanner;
545 
546   /// Set up the values of the IVs correctly when exiting the vector loop.
547   virtual void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
548                             Value *VectorTripCount, BasicBlock *MiddleBlock,
549                             VPTransformState &State);
550 
551   /// Iteratively sink the scalarized operands of a predicated instruction into
552   /// the block that was created for it.
553   void sinkScalarOperands(Instruction *PredInst);
554 
555   /// Returns (and creates if needed) the trip count of the widened loop.
556   Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);
557 
558   /// Emit a bypass check to see if the vector trip count is zero, including if
559   /// it overflows.
560   void emitIterationCountCheck(BasicBlock *Bypass);
561 
562   /// Emit a bypass check to see if all of the SCEV assumptions we've
563   /// had to make are correct. Returns the block containing the checks or
564   /// nullptr if no checks have been added.
565   BasicBlock *emitSCEVChecks(BasicBlock *Bypass);
566 
567   /// Emit bypass checks to check any memory assumptions we may have made.
568   /// Returns the block containing the checks or nullptr if no checks have been
569   /// added.
570   BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass);
571 
572   /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
573   /// vector loop preheader, middle block and scalar preheader.
574   void createVectorLoopSkeleton(StringRef Prefix);
575 
576   /// Create and record the values for induction variables to resume coming from
577   /// the additional bypass block.
578   void createInductionAdditionalBypassValues(const SCEV2ValueTy &ExpandedSCEVs,
579                                              Value *MainVectorTripCount);
580 
581   /// Allow subclasses to override and print debug traces before/after vplan
582   /// execution, when trace information is requested.
583   virtual void printDebugTracesAtStart() {}
584   virtual void printDebugTracesAtEnd() {}
585 
586   /// Introduces a new VPIRBasicBlock for \p CheckIRBB to Plan between the
587   /// vector preheader and its predecessor, also connecting the new block to the
588   /// scalar preheader.
589   void introduceCheckBlockInVPlan(BasicBlock *CheckIRBB);
590 
591   /// The original loop.
592   Loop *OrigLoop;
593 
594   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
595   /// dynamic knowledge to simplify SCEV expressions and converts them to a
596   /// more usable form.
597   PredicatedScalarEvolution &PSE;
598 
599   /// Loop Info.
600   LoopInfo *LI;
601 
602   /// Dominator Tree.
603   DominatorTree *DT;
604 
605   /// Target Library Info.
606   const TargetLibraryInfo *TLI;
607 
608   /// Target Transform Info.
609   const TargetTransformInfo *TTI;
610 
611   /// Assumption Cache.
612   AssumptionCache *AC;
613 
614   /// Interface to emit optimization remarks.
615   OptimizationRemarkEmitter *ORE;
616 
617   /// The vectorization SIMD factor to use. Each vector will have this many
618   /// vector elements.
619   ElementCount VF;
620 
621   ElementCount MinProfitableTripCount;
622 
623   /// The vectorization unroll factor to use. Each scalar is vectorized to this
624   /// many different vector instructions.
625   unsigned UF;
626 
627   /// The builder that we use
628   IRBuilder<> Builder;
629 
630   // --- Vectorization state ---
631 
632   /// The vector-loop preheader.
633   BasicBlock *LoopVectorPreHeader;
634 
635   /// The scalar-loop preheader.
636   BasicBlock *LoopScalarPreHeader;
637 
638   /// Middle Block between the vector and the scalar.
639   BasicBlock *LoopMiddleBlock;
640 
641   /// A list of all bypass blocks. The first block is the entry of the loop.
642   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
643 
644   /// Store instructions that were predicated.
645   SmallVector<Instruction *, 4> PredicatedInstructions;
646 
647   /// Trip count of the original loop.
648   Value *TripCount = nullptr;
649 
650   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
651   Value *VectorTripCount = nullptr;
652 
653   /// The legality analysis.
654   LoopVectorizationLegality *Legal;
655 
656   /// The profitability analysis.
657   LoopVectorizationCostModel *Cost;
658 
659   // Record whether runtime checks are added.
660   bool AddedSafetyChecks = false;
661 
662   /// BFI and PSI are used to check for profile guided size optimizations.
663   BlockFrequencyInfo *BFI;
664   ProfileSummaryInfo *PSI;
665 
666   // Whether this loop should be optimized for size based on profile-guided
667   // size optimizations.
668   bool OptForSizeBasedOnProfile;
669 
670   /// Structure to hold information about generated runtime checks, responsible
671   /// for cleaning up the checks if vectorization turns out to be unprofitable.
672   GeneratedRTChecks &RTChecks;
673 
674   /// Mapping of induction phis to their additional bypass values. They
675   /// need to be added as operands to phi nodes in the scalar loop preheader
676   /// after the epilogue skeleton has been created.
677   DenseMap<PHINode *, Value *> Induction2AdditionalBypassValue;
678 
679   /// The additional bypass block which conditionally skips over the epilogue
680   /// loop after executing the main loop. Needed to resume inductions and
681   /// reductions during epilogue vectorization.
682   BasicBlock *AdditionalBypassBlock = nullptr;
683 
684   VPlan &Plan;
685 
686   /// The vector preheader block of \p Plan, used as target for check blocks
687   /// introduced during skeleton creation.
688   VPBlockBase *VectorPHVPB;
689 };
690 
691 /// Encapsulate information regarding vectorization of a loop and its epilogue.
692 /// This information is meant to be updated and used across two stages of
693 /// epilogue vectorization.
694 struct EpilogueLoopVectorizationInfo {
695   ElementCount MainLoopVF = ElementCount::getFixed(0);
696   unsigned MainLoopUF = 0;
697   ElementCount EpilogueVF = ElementCount::getFixed(0);
698   unsigned EpilogueUF = 0;
699   BasicBlock *MainLoopIterationCountCheck = nullptr;
700   BasicBlock *EpilogueIterationCountCheck = nullptr;
701   BasicBlock *SCEVSafetyCheck = nullptr;
702   BasicBlock *MemSafetyCheck = nullptr;
703   Value *TripCount = nullptr;
704   Value *VectorTripCount = nullptr;
705   VPlan &EpiloguePlan;
706 
707   EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
708                                 ElementCount EVF, unsigned EUF,
709                                 VPlan &EpiloguePlan)
710       : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF),
711         EpiloguePlan(EpiloguePlan) {
712     assert(EUF == 1 &&
713            "A high UF for the epilogue loop is likely not beneficial.");
714   }
715 };
716 
717 /// An extension of the inner loop vectorizer that creates a skeleton for a
718 /// vectorized loop that has its epilogue (residual) also vectorized.
719 /// The idea is to run the VPlan on a given loop twice, first to set up the
720 /// skeleton and vectorize the main loop, and second to complete the skeleton
721 /// from the first step and vectorize the epilogue.  This is achieved by
722 /// deriving two concrete strategy classes from this base class and invoking
723 /// them in succession from the loop vectorizer planner.
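/// Roughly, the resulting structure looks like this (illustrative sketch):
///
///   iteration-count check (main VF)
///     -> main vector loop
///   iteration-count check (epilogue VF)
///     -> vector epilogue loop
///   scalar remainder loop
///
/// where each check can bypass its vector loop when too few iterations
/// remain.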
724 class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
725 public:
726   InnerLoopAndEpilogueVectorizer(
727       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
728       DominatorTree *DT, const TargetLibraryInfo *TLI,
729       const TargetTransformInfo *TTI, AssumptionCache *AC,
730       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
731       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
732       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
733       GeneratedRTChecks &Checks, VPlan &Plan)
734       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
735                             EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
736                             CM, BFI, PSI, Checks, Plan),
737         EPI(EPI) {}
738 
739   // Override this function to handle the more complex control flow around the
740   // three loops.
741   BasicBlock *
742   createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final {
743     return createEpilogueVectorizedLoopSkeleton(ExpandedSCEVs);
744   }
745 
746   /// The interface for creating a vectorized skeleton using one of two
747   /// different strategies, each corresponding to one execution of the vplan
748   /// as described above.
749   virtual BasicBlock *
750   createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) = 0;
751 
752   /// Holds and updates state information required to vectorize the main loop
753   /// and its epilogue in two separate passes. This setup helps us avoid
754   /// regenerating and recomputing runtime safety checks. It also helps us to
755   /// shorten the iteration-count-check path length for the cases where the
756   /// iteration count of the loop is so small that the main vector loop is
757   /// completely skipped.
758   EpilogueLoopVectorizationInfo &EPI;
759 };
760 
761 /// A specialized derived class of inner loop vectorizer that performs
762 /// vectorization of *main* loops in the process of vectorizing loops and their
763 /// epilogues.
764 class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
765 public:
766   EpilogueVectorizerMainLoop(
767       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
768       DominatorTree *DT, const TargetLibraryInfo *TLI,
769       const TargetTransformInfo *TTI, AssumptionCache *AC,
770       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
771       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
772       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
773       GeneratedRTChecks &Check, VPlan &Plan)
774       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
775                                        EPI, LVL, CM, BFI, PSI, Check, Plan) {}
776   /// Implements the interface for creating a vectorized skeleton using the
777   /// *main loop* strategy (i.e., the first pass of VPlan execution).
778   BasicBlock *
779   createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
780 
781 protected:
782   /// Emits an iteration count bypass check once for the main loop (when \p
783   /// ForEpilogue is false) and once for the epilogue loop (when \p
784   /// ForEpilogue is true).
785   BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
786   void printDebugTracesAtStart() override;
787   void printDebugTracesAtEnd() override;
788 
789   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
790                     Value *VectorTripCount, BasicBlock *MiddleBlock,
791                     VPTransformState &State) override {};
792 };
793 
794 // A specialized derived class of inner loop vectorizer that performs
795 // vectorization of *epilogue* loops in the process of vectorizing loops and
796 // their epilogues.
797 class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
798 public:
799   EpilogueVectorizerEpilogueLoop(
800       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
801       DominatorTree *DT, const TargetLibraryInfo *TLI,
802       const TargetTransformInfo *TTI, AssumptionCache *AC,
803       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
804       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
805       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
806       GeneratedRTChecks &Checks, VPlan &Plan)
807       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
808                                        EPI, LVL, CM, BFI, PSI, Checks, Plan) {
809     TripCount = EPI.TripCount;
810   }
811   /// Implements the interface for creating a vectorized skeleton using the
812   /// *epilogue loop* strategy (i.e., the second pass of VPlan execution).
813   BasicBlock *
814   createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
815 
816 protected:
817   /// Emits an iteration count bypass check after the main vector loop has
818   /// finished to see if there are any iterations left to execute by either
819   /// the vector epilogue or the scalar epilogue.
820   BasicBlock *
821   emitMinimumVectorEpilogueIterCountCheck(BasicBlock *Bypass,
822                                           BasicBlock *Insert);
823   void printDebugTracesAtStart() override;
824   void printDebugTracesAtEnd() override;
825 };
826 } // end namespace llvm
827 
828 /// Look for a meaningful debug location on the instruction or its operands.
829 static DebugLoc getDebugLocFromInstOrOperands(Instruction *I) {
830   if (!I)
831     return DebugLoc();
832 
833   DebugLoc Empty;
834   if (I->getDebugLoc() != Empty)
835     return I->getDebugLoc();
836 
837   for (Use &Op : I->operands()) {
838     if (Instruction *OpInst = dyn_cast<Instruction>(Op))
839       if (OpInst->getDebugLoc() != Empty)
840         return OpInst->getDebugLoc();
841   }
842 
843   return I->getDebugLoc();
844 }
845 
846 /// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
847 /// is passed, the message relates to that particular instruction.
848 #ifndef NDEBUG
849 static void debugVectorizationMessage(const StringRef Prefix,
850                                       const StringRef DebugMsg,
851                                       Instruction *I) {
852   dbgs() << "LV: " << Prefix << DebugMsg;
853   if (I != nullptr)
854     dbgs() << " " << *I;
855   else
856     dbgs() << '.';
857   dbgs() << '\n';
858 }
859 #endif
860 
861 /// Create an analysis remark that explains why vectorization failed
862 ///
863 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
864 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
865 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
866 /// the location of the remark. If \p DL is passed, use it as debug location for
867 /// the remark. \return the remark object that can be streamed to.
868 static OptimizationRemarkAnalysis
869 createLVAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop,
870                  Instruction *I, DebugLoc DL = {}) {
871   Value *CodeRegion = I ? I->getParent() : TheLoop->getHeader();
872   // If debug location is attached to the instruction, use it. Otherwise if DL
873   // was not provided, use the loop's.
874   if (I && I->getDebugLoc())
875     DL = I->getDebugLoc();
876   else if (!DL)
877     DL = TheLoop->getStartLoc();
878 
879   return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
880 }
881 
882 namespace llvm {
883 
884 /// Return a value for Step multiplied by VF.
885 Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
886                        int64_t Step) {
887   assert(Ty->isIntegerTy() && "Expected an integer step");
888   return B.CreateElementCount(Ty, VF.multiplyCoefficientBy(Step));
889 }
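// For example (illustrative): with Ty = i64, VF = 4 (fixed) and Step = 2 this
// folds to the constant i64 8; with a scalable VF = <vscale x 4> it emits
// something like
//   %vscale = call i64 @llvm.vscale.i64()
//   %step   = mul i64 %vscale, 8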
890 
891 /// Return the runtime value for VF.
892 Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
893   return B.CreateElementCount(Ty, VF);
894 }
895 
896 void reportVectorizationFailure(const StringRef DebugMsg,
897                                 const StringRef OREMsg, const StringRef ORETag,
898                                 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
899                                 Instruction *I) {
900   LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
901   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
902   ORE->emit(
903       createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
904       << "loop not vectorized: " << OREMsg);
905 }
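// The emitted remark can be surfaced with, e.g., -Rpass-analysis=loop-vectorize
// (clang) or -pass-remarks-analysis=loop-vectorize (opt) and reads roughly:
//   remark: loop not vectorized: <OREMsg>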
906 
907 /// Reports an informative message: print \p Msg for debugging purposes as well
908 /// as an optimization remark. Uses either \p I as location of the remark, or
909 /// otherwise \p TheLoop. If \p DL is passed, use it as debug location for the
910 /// remark.
911 static void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
912                                     OptimizationRemarkEmitter *ORE,
913                                     Loop *TheLoop, Instruction *I = nullptr,
914                                     DebugLoc DL = {}) {
915   LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
916   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
917   ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop,
918                              I, DL)
919             << Msg);
920 }
921 
922 /// Report successful vectorization of the loop. In case an outer loop is
923 /// vectorized, prepend "outer" to the vectorization remark.
924 static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop,
925                                 VectorizationFactor VF, unsigned IC) {
926   LLVM_DEBUG(debugVectorizationMessage(
927       "Vectorizing: ", TheLoop->isInnermost() ? "innermost loop" : "outer loop",
928       nullptr));
929   StringRef LoopType = TheLoop->isInnermost() ? "" : "outer ";
930   ORE->emit([&]() {
931     return OptimizationRemark(LV_NAME, "Vectorized", TheLoop->getStartLoc(),
932                               TheLoop->getHeader())
933            << "vectorized " << LoopType << "loop (vectorization width: "
934            << ore::NV("VectorizationFactor", VF.Width)
935            << ", interleaved count: " << ore::NV("InterleaveCount", IC) << ")";
936   });
937 }
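// The emitted remark reads, for example:
//   remark: vectorized loop (vectorization width: 4, interleaved count: 2)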
938 
939 } // end namespace llvm
940 
941 namespace llvm {
942 
943 // Hints for the loop vectorization cost model about how the scalar epilogue
944 // loop should be lowered.
945 enum ScalarEpilogueLowering {
946 
947   // The default: allowing scalar epilogues.
948   CM_ScalarEpilogueAllowed,
949 
950   // Vectorization with OptForSize: don't allow epilogues.
951   CM_ScalarEpilogueNotAllowedOptSize,
952 
953   // A special case of vectorization with OptForSize: loops with a very small
954   // trip count are considered for vectorization under OptForSize, thereby
955   // making sure the cost of their loop body is dominant, free of runtime
956   // guards and scalar iteration overheads.
957   CM_ScalarEpilogueNotAllowedLowTripLoop,
958 
959   // Loop hint predicate indicating an epilogue is undesired.
960   CM_ScalarEpilogueNotNeededUsePredicate,
961 
962   // Directive indicating we must either tail fold or not vectorize
963   CM_ScalarEpilogueNotAllowedUsePredicate
964 };
965 
966 using InstructionVFPair = std::pair<Instruction *, ElementCount>;
967 
968 /// LoopVectorizationCostModel - estimates the expected speedups due to
969 /// vectorization.
970 /// In many cases vectorization is not profitable. This can happen because of
971 /// a number of reasons. In this class we mainly attempt to predict the
972 /// expected speedup/slowdowns due to the supported instruction set. We use the
973 /// TargetTransformInfo to query the different backends for the cost of
974 /// different operations.
975 class LoopVectorizationCostModel {
976   friend class LoopVectorizationPlanner;
977 
978 public:
979   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
980                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
981                              LoopVectorizationLegality *Legal,
982                              const TargetTransformInfo &TTI,
983                              const TargetLibraryInfo *TLI, DemandedBits *DB,
984                              AssumptionCache *AC,
985                              OptimizationRemarkEmitter *ORE, const Function *F,
986                              const LoopVectorizeHints *Hints,
987                              InterleavedAccessInfo &IAI)
988       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
989         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
990         Hints(Hints), InterleaveInfo(IAI) {}
991 
992   /// \return An upper bound for the vectorization factors (both fixed and
993   /// scalable). If the factors are 0, vectorization and interleaving should be
994   /// avoided up front.
995   FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
996 
997   /// \return True if runtime checks are required for vectorization, and false
998   /// otherwise.
999   bool runtimeChecksRequired();
1000 
1001   /// Setup cost-based decisions for user vectorization factor.
1002   /// \return true if the UserVF is a feasible VF to be chosen.
1003   bool selectUserVectorizationFactor(ElementCount UserVF) {
1004     collectUniformsAndScalars(UserVF);
1005     collectInstsToScalarize(UserVF);
1006     return expectedCost(UserVF).isValid();
1007   }
1008 
1009   /// \return The size (in bits) of the smallest and widest types in the code
1010   /// that needs to be vectorized. We ignore values that remain scalar such as
1011   /// 64 bit loop indices.
1012   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1013 
1014   /// \return The desired interleave count.
1015   /// If the interleave count has been specified by metadata, it is returned.
1016   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1017   /// are the selected vectorization factor and the cost of the selected VF.
1018   unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost);
1019 
1020   /// A memory access instruction may be vectorized in more than one way.
1021   /// The form of the instruction after vectorization depends on its cost.
1022   /// This function takes cost-based decisions for Load/Store instructions
1023   /// and collects them in a map. This decision map is used for building
1024   /// the lists of loop-uniform and loop-scalar instructions.
1025   /// The calculated cost is saved with widening decision in order to
1026   /// avoid redundant calculations.
1027   void setCostBasedWideningDecision(ElementCount VF);
1028 
1029   /// A call may be vectorized in different ways depending on whether we have
1030   /// vectorized variants available and whether the target supports masking.
1031   /// This function analyzes all calls in the function at the supplied VF,
1032   /// makes a decision based on the costs of available options, and stores that
1033   /// decision in a map for use in planning and plan execution.
1034   void setVectorizedCallDecision(ElementCount VF);
1035 
1036   /// A struct that represents some properties of the register usage
1037   /// of a loop.
1038   struct RegisterUsage {
1039     /// Holds the number of loop invariant values that are used in the loop.
1040     /// The key is ClassID of target-provided register class.
1041     SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1042     /// Holds the maximum number of concurrent live intervals in the loop.
1043     /// The key is ClassID of target-provided register class.
1044     SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1045   };
1046 
1047   /// \return Information about the register usage of the loop for the given
1048   /// vectorization factors.
1049   SmallVector<RegisterUsage, 8>
1050   calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1051 
1052   /// Collect values we want to ignore in the cost model.
1053   void collectValuesToIgnore();
1054 
1055   /// Collect all element types in the loop for which widening is needed.
1056   void collectElementTypesForWidening();
1057 
1058   /// Split reductions into those that happen in the loop, and those that happen
1059   /// outside. In-loop reductions are collected into InLoopReductions.
1060   void collectInLoopReductions();
1061 
1062   /// Returns true if we should use strict in-order reductions for the given
1063   /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
1064   /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
1065   /// of FP operations.
1066   bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
1067     return !Hints->allowReordering() && RdxDesc.isOrdered();
1068   }
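  // For an ordered (in-order) FP reduction, the vector loop preserves the
  // sequential semantics by feeding the running scalar value into the
  // reduction, e.g. (illustrative IR):
  //   %red.next = call float @llvm.vector.reduce.fadd.v4f32(float %red,
  //                                                         <4 x float> %vec)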
1069 
1070   /// \returns The smallest bitwidth each instruction can be represented with.
1071   /// The vector equivalents of these instructions should be truncated to this
1072   /// type.
1073   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1074     return MinBWs;
1075   }
1076 
1077   /// \returns True if it is more profitable to scalarize instruction \p I for
1078   /// vectorization factor \p VF.
1079   bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1080     assert(VF.isVector() &&
1081            "Profitable to scalarize relevant only for VF > 1.");
1082     assert(
1083         TheLoop->isInnermost() &&
1084         "cost-model should not be used for outer loops (in VPlan-native path)");
1085 
1086     auto Scalars = InstsToScalarize.find(VF);
1087     assert(Scalars != InstsToScalarize.end() &&
1088            "VF not yet analyzed for scalarization profitability");
1089     return Scalars->second.contains(I);
1090   }
1091 
1092   /// Returns true if \p I is known to be uniform after vectorization.
1093   bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1094     assert(
1095         TheLoop->isInnermost() &&
1096         "cost-model should not be used for outer loops (in VPlan-native path)");
1097     // Pseudo probes need to be duplicated for each unrolled iteration and
1098     // vector lane so that the profiled loop trip count can be accurately
1099     // accumulated instead of being undercounted.
1100     if (isa<PseudoProbeInst>(I))
1101       return false;
1102 
1103     if (VF.isScalar())
1104       return true;
1105 
1106     auto UniformsPerVF = Uniforms.find(VF);
1107     assert(UniformsPerVF != Uniforms.end() &&
1108            "VF not yet analyzed for uniformity");
1109     return UniformsPerVF->second.count(I);
1110   }
1111 
1112   /// Returns true if \p I is known to be scalar after vectorization.
1113   bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1114     assert(
1115         TheLoop->isInnermost() &&
1116         "cost-model should not be used for outer loops (in VPlan-native path)");
1117     if (VF.isScalar())
1118       return true;
1119 
1120     auto ScalarsPerVF = Scalars.find(VF);
1121     assert(ScalarsPerVF != Scalars.end() &&
1122            "Scalar values are not calculated for VF");
1123     return ScalarsPerVF->second.count(I);
1124   }
1125 
1126   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1127   /// for vectorization factor \p VF.
1128   bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1129     return VF.isVector() && MinBWs.contains(I) &&
1130            !isProfitableToScalarize(I, VF) &&
1131            !isScalarAfterVectorization(I, VF);
1132   }
1133 
1134   /// Decision that was taken during cost calculation for memory instruction.
1135   enum InstWidening {
1136     CM_Unknown,
1137     CM_Widen,         // For consecutive accesses with stride +1.
1138     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1139     CM_Interleave,
1140     CM_GatherScatter,
1141     CM_Scalarize,
1142     CM_VectorCall,
1143     CM_IntrinsicCall
1144   };
1145 
1146   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1147   /// instruction \p I and vector width \p VF.
1148   void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1149                            InstructionCost Cost) {
1150     assert(VF.isVector() && "Expected VF >=2");
1151     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1152   }
1153 
1154   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1155   /// interleaving group \p Grp and vector width \p VF.
1156   void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1157                            ElementCount VF, InstWidening W,
1158                            InstructionCost Cost) {
1159     assert(VF.isVector() && "Expected VF >=2");
1160     /// Broadcast this decision to all instructions inside the group.
1161     /// When interleaving, the cost is only assigned to one instruction, the
1162     /// insert position. For other cases, add the appropriate fraction of the
1163     /// total cost to each instruction. This ensures accurate costs are used,
1164     /// even if the insert position instruction is not used.
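    /// For example, if a group with four members is scalarized with a total
    /// cost of 8, each member is assigned a cost of 2; with CM_Interleave the
    /// full cost of 8 is assigned to the insert position alone.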
1165     InstructionCost InsertPosCost = Cost;
1166     InstructionCost OtherMemberCost = 0;
1167     if (W != CM_Interleave)
1168       OtherMemberCost = InsertPosCost = Cost / Grp->getNumMembers();
1170     for (unsigned Idx = 0; Idx < Grp->getFactor(); ++Idx) {
1171       if (auto *I = Grp->getMember(Idx)) {
1172         if (Grp->getInsertPos() == I)
1173           WideningDecisions[std::make_pair(I, VF)] =
1174               std::make_pair(W, InsertPosCost);
1175         else
1176           WideningDecisions[std::make_pair(I, VF)] =
1177               std::make_pair(W, OtherMemberCost);
1178       }
1179     }
1180   }
1181 
1182   /// Return the cost model decision for the given instruction \p I and vector
1183   /// width \p VF. Return CM_Unknown if this instruction did not pass
1184   /// through the cost modeling.
1185   InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1186     assert(VF.isVector() && "Expected VF to be a vector VF");
1187     assert(
1188         TheLoop->isInnermost() &&
1189         "cost-model should not be used for outer loops (in VPlan-native path)");
1190 
1191     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1192     auto Itr = WideningDecisions.find(InstOnVF);
1193     if (Itr == WideningDecisions.end())
1194       return CM_Unknown;
1195     return Itr->second.first;
1196   }
1197 
1198   /// Return the vectorization cost for the given instruction \p I and vector
1199   /// width \p VF.
1200   InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1201     assert(VF.isVector() && "Expected VF >=2");
1202     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1203     assert(WideningDecisions.contains(InstOnVF) &&
1204            "The cost is not calculated");
1205     return WideningDecisions[InstOnVF].second;
1206   }
1207 
1208   struct CallWideningDecision {
1209     InstWidening Kind;
1210     Function *Variant;
1211     Intrinsic::ID IID;
1212     std::optional<unsigned> MaskPos;
1213     InstructionCost Cost;
1214   };
1215 
1216   void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind,
1217                                Function *Variant, Intrinsic::ID IID,
1218                                std::optional<unsigned> MaskPos,
1219                                InstructionCost Cost) {
1220     assert(!VF.isScalar() && "Expected vector VF");
1221     CallWideningDecisions[std::make_pair(CI, VF)] = {Kind, Variant, IID,
1222                                                      MaskPos, Cost};
1223   }
1224 
1225   CallWideningDecision getCallWideningDecision(CallInst *CI,
1226                                                ElementCount VF) const {
1227     assert(!VF.isScalar() && "Expected vector VF");
1228     return CallWideningDecisions.at(std::make_pair(CI, VF));
1229   }
1230 
1231   /// Return True if instruction \p I is an optimizable truncate whose operand
1232   /// is an induction variable. Such a truncate will be removed by adding a new
1233   /// induction variable with the destination type.
1234   bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1235     // If the instruction is not a truncate, return false.
1236     auto *Trunc = dyn_cast<TruncInst>(I);
1237     if (!Trunc)
1238       return false;
1239 
1240     // Get the source and destination types of the truncate.
1241     Type *SrcTy = toVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1242     Type *DestTy = toVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1243 
1244     // If the truncate is free for the given types, return false. Replacing a
1245     // free truncate with an induction variable would add an induction variable
1246     // update instruction to each iteration of the loop. We exclude from this
1247     // check the primary induction variable since it will need an update
1248     // instruction regardless.
1249     Value *Op = Trunc->getOperand(0);
1250     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1251       return false;
1252 
1253     // If the truncated value is not an induction variable, return false.
1254     return Legal->isInductionPhi(Op);
1255   }
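
  // A minimal sketch of the pattern this matches (names are illustrative):
  //   %iv    = phi i64 [ 0, %ph ], [ %iv.next, %loop ]  ; induction phi
  //   %trunc = trunc i64 %iv to i32                     ; optimizable truncate
  // Rather than truncating the wide IV on every iteration, the vectorizer can
  // introduce a new i32 induction variable and drop the truncate, provided the
  // truncate of a non-primary induction is not already free for the target.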
1256 
1257   /// Collects the instructions to scalarize for each predicated instruction in
1258   /// the loop.
1259   void collectInstsToScalarize(ElementCount VF);
1260 
1261   /// Collect Uniform and Scalar values for the given \p VF.
1262   /// The sets depend on CM decision for Load/Store instructions
1263   /// that may be vectorized as interleave, gather-scatter or scalarized.
1264   /// Also make a decision on what to do about call instructions in the loop
1265   /// at that VF -- scalarize, call a known vector routine, or call a
1266   /// vector intrinsic.
1267   void collectUniformsAndScalars(ElementCount VF) {
1268     // Do the analysis once.
1269     if (VF.isScalar() || Uniforms.contains(VF))
1270       return;
1271     setCostBasedWideningDecision(VF);
1272     collectLoopUniforms(VF);
1273     setVectorizedCallDecision(VF);
1274     collectLoopScalars(VF);
1275   }
1276 
1277   /// Returns true if the target machine supports masked store operation
1278   /// for the given \p DataType and kind of access to \p Ptr.
1279   bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1280     return Legal->isConsecutivePtr(DataType, Ptr) &&
1281            TTI.isLegalMaskedStore(DataType, Alignment);
1282   }
1283 
1284   /// Returns true if the target machine supports masked load operation
1285   /// for the given \p DataType and kind of access to \p Ptr.
1286   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1287     return Legal->isConsecutivePtr(DataType, Ptr) &&
1288            TTI.isLegalMaskedLoad(DataType, Alignment);
1289   }
1290 
1291   /// Returns true if the target machine can represent \p V as a masked gather
1292   /// or scatter operation.
1293   bool isLegalGatherOrScatter(Value *V, ElementCount VF) {
1294     bool LI = isa<LoadInst>(V);
1295     bool SI = isa<StoreInst>(V);
1296     if (!LI && !SI)
1297       return false;
1298     auto *Ty = getLoadStoreType(V);
1299     Align Align = getLoadStoreAlignment(V);
1300     if (VF.isVector())
1301       Ty = VectorType::get(Ty, VF);
1302     return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1303            (SI && TTI.isLegalMaskedScatter(Ty, Align));
1304   }
1305 
1306   /// Returns true if the target machine supports all of the reduction
1307   /// variables found for the given VF.
1308   bool canVectorizeReductions(ElementCount VF) const {
1309     return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1310       const RecurrenceDescriptor &RdxDesc = Reduction.second;
1311       return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1312     }));
1313   }
1314 
1315   /// Given costs for both strategies, return true if the scalar predication
1316   /// lowering should be used for div/rem.  This incorporates an override
1317   /// option so it is not simply a cost comparison.
1318   bool isDivRemScalarWithPredication(InstructionCost ScalarCost,
1319                                      InstructionCost SafeDivisorCost) const {
1320     switch (ForceSafeDivisor) {
1321     case cl::BOU_UNSET:
1322       return ScalarCost < SafeDivisorCost;
1323     case cl::BOU_TRUE:
1324       return false;
1325     case cl::BOU_FALSE:
1326       return true;
1327     }
1328     llvm_unreachable("impossible case value");
1329   }
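
  // A rough sketch of the two strategies being compared (illustrative IR):
  // scalar predication branches around the division so it only executes on
  // active lanes, while the safe-divisor strategy stays vectorized and
  // substitutes a benign divisor for masked-off lanes, e.g.
  //   %safe.div = select <4 x i1> %mask, <4 x i32> %d,
  //                      <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  //   %quot     = udiv <4 x i32> %n, %safe.div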
1330 
1331   /// Returns true if \p I is an instruction which requires predication and
1332   /// for which our chosen predication strategy is scalarization (i.e. we
1333   /// don't have an alternate strategy such as masking available).
1334   /// \p VF is the vectorization factor that will be used to vectorize \p I.
1335   bool isScalarWithPredication(Instruction *I, ElementCount VF) const;
1336 
1337   /// Returns true if \p I is an instruction that needs to be predicated
1338   /// at runtime.  The result is independent of the predication mechanism.
1339   /// Superset of instructions that return true for isScalarWithPredication.
1340   bool isPredicatedInst(Instruction *I) const;
1341 
1342   /// Return the costs for our two available strategies for lowering a
1343   /// div/rem operation which requires speculating at least one lane.
1344   /// First result is for scalarization (will be invalid for scalable
1345   /// vectors); second is for the safe-divisor strategy.
1346   std::pair<InstructionCost, InstructionCost>
1347   getDivRemSpeculationCost(Instruction *I,
1348                            ElementCount VF) const;
1349 
1350   /// Returns true if \p I is a memory instruction with consecutive memory
1351   /// access that can be widened.
1352   bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF);
1353 
1354   /// Returns true if \p I is a memory instruction in an interleaved-group
1355   /// of memory accesses that can be vectorized with wide vector loads/stores
1356   /// and shuffles.
1357   bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF) const;
1358 
1359   /// Check if \p Instr belongs to any interleaved access group.
1360   bool isAccessInterleaved(Instruction *Instr) const {
1361     return InterleaveInfo.isInterleaved(Instr);
1362   }
1363 
1364   /// Get the interleaved access group that \p Instr belongs to.
1365   const InterleaveGroup<Instruction> *
1366   getInterleavedAccessGroup(Instruction *Instr) const {
1367     return InterleaveInfo.getInterleaveGroup(Instr);
1368   }
1369 
1370   /// Returns true if we're required to use a scalar epilogue for at least
1371   /// the final iteration of the original loop.
1372   bool requiresScalarEpilogue(bool IsVectorizing) const {
1373     if (!isScalarEpilogueAllowed()) {
1374       LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
1375       return false;
1376     }
1377     // If we might exit from anywhere but the latch and early exit vectorization
1378     // is disabled, we must run the exiting iteration in scalar form.
1379     if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch() &&
1380         !(EnableEarlyExitVectorization && Legal->hasUncountableEarlyExit())) {
1381       LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: not exiting "
1382                            "from latch block\n");
1383       return true;
1384     }
1385     if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) {
1386       LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: "
1387                            "interleaved group requires scalar epilogue\n");
1388       return true;
1389     }
1390     LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
1391     return false;
1392   }
1393 
1394   /// Returns true if we're required to use a scalar epilogue for at least
1395   /// the final iteration of the original loop for all VFs in \p Range.
1396   /// A scalar epilogue must either be required for all VFs in \p Range or for
1397   /// none.
1398   bool requiresScalarEpilogue(VFRange Range) const {
1399     auto RequiresScalarEpilogue = [this](ElementCount VF) {
1400       return requiresScalarEpilogue(VF.isVector());
1401     };
1402     bool IsRequired = all_of(Range, RequiresScalarEpilogue);
1403     assert(
1404         (IsRequired || none_of(Range, RequiresScalarEpilogue)) &&
1405         "all VFs in range must agree on whether a scalar epilogue is required");
1406     return IsRequired;
1407   }
1408 
1409   /// Returns true if a scalar epilogue is not allowed due to optsize or a
1410   /// loop hint annotation.
1411   bool isScalarEpilogueAllowed() const {
1412     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1413   }
1414 
1415   /// Returns the TailFoldingStyle that is best for the current loop.
1416   TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
1417     if (!ChosenTailFoldingStyle)
1418       return TailFoldingStyle::None;
1419     return IVUpdateMayOverflow ? ChosenTailFoldingStyle->first
1420                                : ChosenTailFoldingStyle->second;
1421   }
1422 
1423   /// Selects and saves the TailFoldingStyle for two cases: whether or not the
1424   /// IV update may overflow.
1425   /// \param IsScalableVF true if scalable vector factors are enabled.
1426   /// \param UserIC User-specified interleave count.
1427   void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC) {
1428     assert(!ChosenTailFoldingStyle && "Tail folding must not be selected yet.");
1429     if (!Legal->canFoldTailByMasking()) {
1430       ChosenTailFoldingStyle =
1431           std::make_pair(TailFoldingStyle::None, TailFoldingStyle::None);
1432       return;
1433     }
1434 
1435     if (!ForceTailFoldingStyle.getNumOccurrences()) {
1436       ChosenTailFoldingStyle = std::make_pair(
1437           TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/true),
1438           TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false));
1439       return;
1440     }
1441 
1442     // Set styles when forced.
1443     ChosenTailFoldingStyle = std::make_pair(ForceTailFoldingStyle.getValue(),
1444                                             ForceTailFoldingStyle.getValue());
1445     if (ForceTailFoldingStyle != TailFoldingStyle::DataWithEVL)
1446       return;
1447     // Override forced styles if needed.
1448     // FIXME: use actual opcode/data type for analysis here.
1449     // FIXME: Investigate opportunity for fixed vector factor.
1450     bool EVLIsLegal = UserIC <= 1 &&
1451                       TTI.hasActiveVectorLength(0, nullptr, Align()) &&
1452                       !EnableVPlanNativePath;
1453     if (!EVLIsLegal) {
1454       // If for some reason EVL mode is unsupported, fallback to
1455       // DataWithoutLaneMask to try to vectorize the loop with folded tail
1456       // in a generic way.
1457       ChosenTailFoldingStyle =
1458           std::make_pair(TailFoldingStyle::DataWithoutLaneMask,
1459                          TailFoldingStyle::DataWithoutLaneMask);
1460       LLVM_DEBUG(
1461           dbgs()
1462           << "LV: Preference for VP intrinsics indicated. Will "
1463              "not try to generate VP Intrinsics "
1464           << (UserIC > 1
1465                   ? "since interleave count specified is greater than 1.\n"
1466                   : "due to non-interleaving reasons.\n"));
1467     }
1468   }
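
  // Illustrative example of what the chosen styles imply (not exhaustive): with
  // TailFoldingStyle::Data the body is guarded by a lane mask such as
  //   %mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 %iv, i64 %tc)
  // so the final partial iteration runs inside the vector loop, whereas
  // DataWithEVL instead queries an explicit vector length (e.g. via
  // @llvm.experimental.get.vector.length) and passes it to VP intrinsics.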
1469 
1470   /// Returns true if all loop blocks should be masked to fold tail loop.
1471   bool foldTailByMasking() const {
1472     // TODO: check if it is possible to check for None style independent of
1473     // IVUpdateMayOverflow flag in getTailFoldingStyle.
1474     return getTailFoldingStyle() != TailFoldingStyle::None;
1475   }
1476 
1477   /// Return the maximum safe number of elements to be processed per vector
1478   /// iteration, i.e. a bound that does not prevent store-load forwarding and
1479   /// is safe with regard to memory dependencies. Required for EVL-based VPlans
1480   /// to correctly calculate AVL (application vector length) as min(remaining
1481   /// AVL, MaxSafeElements).
1482   /// TODO: need to consider adjusting cost model to use this value as a
1483   /// vectorization factor for EVL-based vectorization.
1484   std::optional<unsigned> getMaxSafeElements() const { return MaxSafeElements; }
1485 
1486   /// Returns true if the instructions in this block require predication
1487   /// for any reason, e.g. because tail folding now requires a predicate
1488   /// or because the block in the original loop was predicated.
1489   bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
1490     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1491   }
1492 
1493   /// Returns true if VP intrinsics with explicit vector length support should
1494   /// be generated in the tail folded loop.
1495   bool foldTailWithEVL() const {
1496     return getTailFoldingStyle() == TailFoldingStyle::DataWithEVL;
1497   }
1498 
1499   /// Returns true if the Phi is part of an inloop reduction.
1500   bool isInLoopReduction(PHINode *Phi) const {
1501     return InLoopReductions.contains(Phi);
1502   }
1503 
1504   /// Returns true if the predicated reduction select should be used to set the
1505   /// incoming value for the reduction phi.
1506   bool usePredicatedReductionSelect(unsigned Opcode, Type *PhiTy) const {
1507     // Force to use predicated reduction select since the EVL of the
1508     // second-to-last iteration might not be VF*UF.
1509     if (foldTailWithEVL())
1510       return true;
1511     return PreferPredicatedReductionSelect ||
1512            TTI.preferPredicatedReductionSelect(
1513                Opcode, PhiTy, TargetTransformInfo::ReductionFlags());
1514   }
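
  // Sketch of the pattern this guards (illustrative): a predicated in-loop
  // reduction keeps the previous partial result on masked-off lanes,
  //   %add = add <4 x i32> %red.phi, %vec
  //   %sel = select <4 x i1> %mask, <4 x i32> %add, <4 x i32> %red.phi
  // and the select, rather than the add, feeds the reduction phi on the
  // back-edge when this hook returns true.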
1515 
1516   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1517   /// with factor VF.  Return the cost of the instruction, including
1518   /// scalarization overhead if it's needed.
1519   InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1520 
1521   /// Estimate cost of a call instruction CI if it were vectorized with factor
1522   /// VF. Return the cost of the instruction, including scalarization overhead
1523   /// if it's needed.
1524   InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const;
1525 
1526   /// Invalidates decisions already taken by the cost model.
1527   void invalidateCostModelingDecisions() {
1528     WideningDecisions.clear();
1529     CallWideningDecisions.clear();
1530     Uniforms.clear();
1531     Scalars.clear();
1532   }
1533 
1534   /// Returns the expected execution cost. The unit of the cost does
1535   /// not matter because we use the 'cost' units to compare different
1536   /// vector widths. The cost that is returned is *not* normalized by
1537   /// the factor width.
1538   InstructionCost expectedCost(ElementCount VF);
1539 
1540   bool hasPredStores() const { return NumPredStores > 0; }
1541 
1542   /// Returns true if epilogue vectorization is considered profitable, and
1543   /// false otherwise.
1544   /// \p VF is the vectorization factor chosen for the original loop.
1545   /// \p IC is the interleave count, used as an additional scaling factor
1546   /// applied to VF before comparing to EpilogueVectorizationMinVF.
1547   bool isEpilogueVectorizationProfitable(const ElementCount VF,
1548                                          const unsigned IC) const;
1549 
1550   /// Returns the execution time cost of an instruction for a given vector
1551   /// width. Vector width of one means scalar.
1552   InstructionCost getInstructionCost(Instruction *I, ElementCount VF);
1553 
1554   /// Return the cost of instructions in an inloop reduction pattern, if I is
1555   /// part of that pattern.
1556   std::optional<InstructionCost>
1557   getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
1558                           TTI::TargetCostKind CostKind) const;
1559 
1560   /// Returns true if \p Op should be considered invariant and if it is
1561   /// trivially hoistable.
1562   bool shouldConsiderInvariant(Value *Op);
1563 
1564 private:
1565   unsigned NumPredStores = 0;
1566 
1567   /// \return An upper bound for the vectorization factors for both
1568   /// fixed and scalable vectorization, where the minimum-known number of
1569   /// elements is a power-of-2 larger than zero. If scalable vectorization is
1570   /// disabled or unsupported, then the scalable part will be equal to
1571   /// ElementCount::getScalable(0).
1572   FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount,
1573                                            ElementCount UserVF,
1574                                            bool FoldTailByMasking);
1575 
1576   /// \return the maximized element count based on the target's vector
1577   /// registers and the loop trip-count, but limited to a maximum safe VF.
1578   /// This is a helper function of computeFeasibleMaxVF.
1579   ElementCount getMaximizedVFForTarget(unsigned MaxTripCount,
1580                                        unsigned SmallestType,
1581                                        unsigned WidestType,
1582                                        ElementCount MaxSafeVF,
1583                                        bool FoldTailByMasking);
1584 
1585   /// Checks if scalable vectorization is supported and enabled. Caches the
1586   /// result to avoid repeated debug dumps for repeated queries.
1587   bool isScalableVectorizationAllowed();
1588 
1589   /// \return the maximum legal scalable VF, based on the safe max number
1590   /// of elements.
1591   ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1592 
1593   /// Calculate vectorization cost of memory instruction \p I.
1594   InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1595 
1596   /// The cost computation for scalarized memory instruction.
1597   InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1598 
1599   /// The cost computation for interleaving group of memory instructions.
1600   InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1601 
1602   /// The cost computation for Gather/Scatter instruction.
1603   InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1604 
1605   /// The cost computation for widening instruction \p I with consecutive
1606   /// memory access.
1607   InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1608 
1609   /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1610   /// Load: scalar load + broadcast.
1611   /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1612   /// element)
1613   InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1614 
1615   /// Estimate the overhead of scalarizing an instruction. This is a
1616   /// convenience wrapper for the type-based getScalarizationOverhead API.
1617   InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF,
1618                                            TTI::TargetCostKind CostKind) const;
1619 
1620   /// Returns true if an artificially high cost for emulated masked memrefs
1621   /// should be used.
1622   bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1623 
1624   /// Map of scalar integer values to the smallest bitwidth they can be legally
1625   /// represented as. The vector equivalents of these values should be truncated
1626   /// to this type.
1627   MapVector<Instruction *, uint64_t> MinBWs;
1628 
1629   /// A type representing the costs for instructions if they were to be
1630   /// scalarized rather than vectorized. The entries are Instruction-Cost
1631   /// pairs.
1632   using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1633 
1634   /// Per-VF sets containing all BasicBlocks that are known to be present
1635   /// after vectorization as predicated blocks.
1636   DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>>
1637       PredicatedBBsAfterVectorization;
1638 
1639   /// Records whether it is allowed to have the original scalar loop execute at
1640   /// least once. This may be needed as a fallback loop in case runtime
1641   /// aliasing/dependence checks fail, or to handle the tail/remainder
1642   /// iterations when the trip count is unknown or doesn't divide by the VF,
1643   /// or as a peel-loop to handle gaps in interleave-groups.
1644   /// Under optsize and when the trip count is very small we don't allow any
1645   /// iterations to execute in the scalar loop.
1646   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1647 
1648   /// Holds the finally chosen tail-folding style. The first element is used
1649   /// if the IV update may overflow, the second element if it does not.
1650   std::optional<std::pair<TailFoldingStyle, TailFoldingStyle>>
1651       ChosenTailFoldingStyle;
1652 
1653   /// true if scalable vectorization is supported and enabled.
1654   std::optional<bool> IsScalableVectorizationAllowed;
1655 
1656   /// Maximum safe number of elements to be processed per vector iteration:
1657   /// a bound that does not prevent store-load forwarding and is safe with
1658   /// regard to memory dependencies. Required for EVL-based vectorization,
1659   /// where this value is used as the upper bound of the safe AVL.
1660   std::optional<unsigned> MaxSafeElements;
1661 
1662   /// A map holding scalar costs for different vectorization factors. The
1663   /// presence of a cost for an instruction in the mapping indicates that the
1664   /// instruction will be scalarized when vectorizing with the associated
1665   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1666   DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1667 
1668   /// Holds the instructions known to be uniform after vectorization.
1669   /// The data is collected per VF.
1670   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1671 
1672   /// Holds the instructions known to be scalar after vectorization.
1673   /// The data is collected per VF.
1674   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1675 
1676   /// Holds the instructions (address computations) that are forced to be
1677   /// scalarized.
1678   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1679 
1680   /// PHINodes of the reductions that should be expanded in-loop.
1681   SmallPtrSet<PHINode *, 4> InLoopReductions;
1682 
1683   /// A Map of inloop reduction operations and their immediate chain operand.
1684   /// FIXME: This can be removed once reductions can be costed correctly in
1685   /// VPlan. This was added to allow quick lookup of the inloop operations.
1686   DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1687 
1688   /// Returns the expected difference in cost from scalarizing the expression
1689   /// feeding a predicated instruction \p PredInst. The instructions to
1690   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1691   /// non-negative return value implies the expression will be scalarized.
1692   /// Currently, only single-use chains are considered for scalarization.
1693   InstructionCost computePredInstDiscount(Instruction *PredInst,
1694                                           ScalarCostsTy &ScalarCosts,
1695                                           ElementCount VF);
1696 
1697   /// Collect the instructions that are uniform after vectorization. An
1698   /// instruction is uniform if we represent it with a single scalar value in
1699   /// the vectorized loop corresponding to each vector iteration. Examples of
1700   /// uniform instructions include pointer operands of consecutive or
1701   /// interleaved memory accesses. Note that although uniformity implies an
1702   /// instruction will be scalar, the reverse is not true. In general, a
1703   /// scalarized instruction will be represented by VF scalar values in the
1704   /// vectorized loop, each corresponding to an iteration of the original
1705   /// scalar loop.
1706   void collectLoopUniforms(ElementCount VF);
1707 
1708   /// Collect the instructions that are scalar after vectorization. An
1709   /// instruction is scalar if it is known to be uniform or will be scalarized
1710   /// during vectorization. collectLoopScalars should only add non-uniform nodes
1711   /// to the list if they are used by a load/store instruction that is marked as
1712   /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
1713   /// VF values in the vectorized loop, each corresponding to an iteration of
1714   /// the original scalar loop.
1715   void collectLoopScalars(ElementCount VF);
1716 
1717   /// Keeps cost model vectorization decision and cost for instructions.
1718   /// Right now it is used for memory instructions only.
1719   using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1720                                 std::pair<InstWidening, InstructionCost>>;
1721 
1722   DecisionList WideningDecisions;
1723 
1724   using CallDecisionList =
1725       DenseMap<std::pair<CallInst *, ElementCount>, CallWideningDecision>;
1726 
1727   CallDecisionList CallWideningDecisions;
1728 
1729   /// Returns true if \p V is expected to be vectorized and it needs to be
1730   /// extracted.
1731   bool needsExtract(Value *V, ElementCount VF) const {
1732     Instruction *I = dyn_cast<Instruction>(V);
1733     if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1734         TheLoop->isLoopInvariant(I) ||
1735         getWideningDecision(I, VF) == CM_Scalarize)
1736       return false;
1737 
1738     // Assume we can vectorize V (and hence we need extraction) if the
1739     // scalars are not computed yet. This can happen, because it is called
1740     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1741     // the scalars are collected. That should be a safe assumption in most
1742     // cases, because we check if the operands have vectorizable types
1743     // beforehand in LoopVectorizationLegality.
1744     return !Scalars.contains(VF) || !isScalarAfterVectorization(I, VF);
1745   };
1746 
1747   /// Returns a range containing only operands needing to be extracted.
1748   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1749                                                    ElementCount VF) const {
1750     return SmallVector<Value *, 4>(make_filter_range(
1751         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1752   }
1753 
1754 public:
1755   /// The loop that we evaluate.
1756   Loop *TheLoop;
1757 
1758   /// Predicated scalar evolution analysis.
1759   PredicatedScalarEvolution &PSE;
1760 
1761   /// Loop Info analysis.
1762   LoopInfo *LI;
1763 
1764   /// Vectorization legality.
1765   LoopVectorizationLegality *Legal;
1766 
1767   /// Vector target information.
1768   const TargetTransformInfo &TTI;
1769 
1770   /// Target Library Info.
1771   const TargetLibraryInfo *TLI;
1772 
1773   /// Demanded bits analysis.
1774   DemandedBits *DB;
1775 
1776   /// Assumption cache.
1777   AssumptionCache *AC;
1778 
1779   /// Interface to emit optimization remarks.
1780   OptimizationRemarkEmitter *ORE;
1781 
1782   const Function *TheFunction;
1783 
1784   /// Loop Vectorize Hint.
1785   const LoopVectorizeHints *Hints;
1786 
1787   /// The interleave access information contains groups of interleaved accesses
1788   /// with the same stride and close to each other.
1789   InterleavedAccessInfo &InterleaveInfo;
1790 
1791   /// Values to ignore in the cost model.
1792   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1793 
1794   /// Values to ignore in the cost model when VF > 1.
1795   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1796 
1797   /// All element types found in the loop.
1798   SmallPtrSet<Type *, 16> ElementTypesInLoop;
1799 };
1800 } // end namespace llvm
1801 
1802 namespace {
1803 /// Helper struct to manage generating runtime checks for vectorization.
1804 ///
1805 /// The runtime checks are created up-front in temporary blocks, un-linked from
1806 /// the existing IR, to allow better estimating their cost. After deciding to
1807 /// vectorize, the checks are moved back. If deciding not to vectorize, the
1808 /// temporary blocks are completely removed.
1809 class GeneratedRTChecks {
1810   /// Basic block which contains the generated SCEV checks, if any.
1811   BasicBlock *SCEVCheckBlock = nullptr;
1812 
1813   /// The value representing the result of the generated SCEV checks. If it is
1814   /// nullptr, either no SCEV checks have been generated or they have been used.
1815   Value *SCEVCheckCond = nullptr;
1816 
1817   /// Basic block which contains the generated memory runtime checks, if any.
1818   BasicBlock *MemCheckBlock = nullptr;
1819 
1820   /// The value representing the result of the generated memory runtime checks.
1821   /// If it is nullptr, either no memory runtime checks have been generated or
1822   /// they have been used.
1823   Value *MemRuntimeCheckCond = nullptr;
1824 
1825   DominatorTree *DT;
1826   LoopInfo *LI;
1827   TargetTransformInfo *TTI;
1828 
1829   SCEVExpander SCEVExp;
1830   SCEVExpander MemCheckExp;
1831 
1832   bool CostTooHigh = false;
1833   const bool AddBranchWeights;
1834 
1835   Loop *OuterLoop = nullptr;
1836 
1837   PredicatedScalarEvolution &PSE;
1838 
1839 public:
1840   GeneratedRTChecks(PredicatedScalarEvolution &PSE, DominatorTree *DT,
1841                     LoopInfo *LI, TargetTransformInfo *TTI,
1842                     const DataLayout &DL, bool AddBranchWeights)
1843       : DT(DT), LI(LI), TTI(TTI), SCEVExp(*PSE.getSE(), DL, "scev.check"),
1844         MemCheckExp(*PSE.getSE(), DL, "scev.check"),
1845         AddBranchWeights(AddBranchWeights), PSE(PSE) {}
1846 
1847   /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1848   /// accurately estimate the cost of the runtime checks. The blocks are
1849   /// un-linked from the IR and are added back during vector code generation. If
1850   /// there is no vector code generation, the check blocks are removed
1851   /// completely.
1852   void create(Loop *L, const LoopAccessInfo &LAI,
1853               const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {
1854 
1855     // Hard cutoff to limit compile-time increase in case a very large number of
1856     // runtime checks needs to be generated.
1857     // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
1858     // profile info.
1859     CostTooHigh =
1860         LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold;
1861     if (CostTooHigh)
1862       return;
1863 
1864     BasicBlock *LoopHeader = L->getHeader();
1865     BasicBlock *Preheader = L->getLoopPreheader();
1866 
1867     // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1868     // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1869     // may be used by SCEVExpander. The blocks will be un-linked from their
1870     // predecessors and removed from LI & DT at the end of the function.
1871     if (!UnionPred.isAlwaysTrue()) {
1872       SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1873                                   nullptr, "vector.scevcheck");
1874 
1875       SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1876           &UnionPred, SCEVCheckBlock->getTerminator());
1877     }
1878 
1879     const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1880     if (RtPtrChecking.Need) {
1881       auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1882       MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
1883                                  "vector.memcheck");
1884 
1885       auto DiffChecks = RtPtrChecking.getDiffChecks();
1886       if (DiffChecks) {
1887         Value *RuntimeVF = nullptr;
1888         MemRuntimeCheckCond = addDiffRuntimeChecks(
1889             MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp,
1890             [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
1891               if (!RuntimeVF)
1892                 RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);
1893               return RuntimeVF;
1894             },
1895             IC);
1896       } else {
1897         MemRuntimeCheckCond = addRuntimeChecks(
1898             MemCheckBlock->getTerminator(), L, RtPtrChecking.getChecks(),
1899             MemCheckExp, VectorizerParams::HoistRuntimeChecks);
1900       }
1901       assert(MemRuntimeCheckCond &&
1902              "no RT checks generated although RtPtrChecking "
1903              "claimed checks are required");
1904     }
1905 
1906     if (!MemCheckBlock && !SCEVCheckBlock)
1907       return;
1908 
1909     // Unhook the temporary block with the checks, update various places
1910     // accordingly.
1911     if (SCEVCheckBlock)
1912       SCEVCheckBlock->replaceAllUsesWith(Preheader);
1913     if (MemCheckBlock)
1914       MemCheckBlock->replaceAllUsesWith(Preheader);
1915 
1916     if (SCEVCheckBlock) {
1917       SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1918       new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
1919       Preheader->getTerminator()->eraseFromParent();
1920     }
1921     if (MemCheckBlock) {
1922       MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1923       new UnreachableInst(Preheader->getContext(), MemCheckBlock);
1924       Preheader->getTerminator()->eraseFromParent();
1925     }
1926 
1927     DT->changeImmediateDominator(LoopHeader, Preheader);
1928     if (MemCheckBlock) {
1929       DT->eraseNode(MemCheckBlock);
1930       LI->removeBlock(MemCheckBlock);
1931     }
1932     if (SCEVCheckBlock) {
1933       DT->eraseNode(SCEVCheckBlock);
1934       LI->removeBlock(SCEVCheckBlock);
1935     }
1936 
1937     // Outer loop is used as part of the later cost calculations.
1938     OuterLoop = L->getParentLoop();
1939   }
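
  // For intuition only: a typical memory runtime check compares the accessed
  // ranges of two pointers for overlap, roughly
  //   %cmp0     = icmp ult ptr %a.start, %b.end
  //   %cmp1     = icmp ult ptr %b.start, %a.end
  //   %conflict = and i1 %cmp0, %cmp1
  // while the SCEV check block materialises the predicates (e.g. no-wrap
  // assumptions) that PSE accumulated while analysing the loop.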
1940 
1941   InstructionCost getCost() {
1942     if (SCEVCheckBlock || MemCheckBlock)
1943       LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");
1944 
1945     if (CostTooHigh) {
1946       InstructionCost Cost;
1947       Cost.setInvalid();
1948       LLVM_DEBUG(dbgs() << "  number of checks exceeded threshold\n");
1949       return Cost;
1950     }
1951 
1952     InstructionCost RTCheckCost = 0;
1953     if (SCEVCheckBlock)
1954       for (Instruction &I : *SCEVCheckBlock) {
1955         if (SCEVCheckBlock->getTerminator() == &I)
1956           continue;
1957         InstructionCost C =
1958             TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
1959         LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
1960         RTCheckCost += C;
1961       }
1962     if (MemCheckBlock) {
1963       InstructionCost MemCheckCost = 0;
1964       for (Instruction &I : *MemCheckBlock) {
1965         if (MemCheckBlock->getTerminator() == &I)
1966           continue;
1967         InstructionCost C =
1968             TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
1969         LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
1970         MemCheckCost += C;
1971       }
1972 
1973       // If the runtime memory checks are being created inside an outer loop
1974       // we should find out if these checks are outer loop invariant. If so,
1975       // the checks will likely be hoisted out and so the effective cost will
1976       // reduce according to the outer loop trip count.
1977       if (OuterLoop) {
1978         ScalarEvolution *SE = MemCheckExp.getSE();
1979         // TODO: If profitable, we could refine this further by analysing every
1980         // individual memory check, since there could be a mixture of loop
1981         // variant and invariant checks that mean the final condition is
1982         // variant.
1983         const SCEV *Cond = SE->getSCEV(MemRuntimeCheckCond);
1984         if (SE->isLoopInvariant(Cond, OuterLoop)) {
1985           // It seems reasonable to assume that we can reduce the effective
1986           // cost of the checks even when we know nothing about the trip
1987           // count. Assume that the outer loop executes at least twice.
1988           unsigned BestTripCount = 2;
1989 
1990           // Get the best known TC estimate.
1991           if (auto EstimatedTC = getSmallBestKnownTC(
1992                   PSE, OuterLoop, /* CanUseConstantMax = */ false))
1993             BestTripCount = *EstimatedTC;
1994 
1995           BestTripCount = std::max(BestTripCount, 1U);
1996           InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount;
1997 
1998           // Let's ensure the cost is always at least 1.
1999           NewMemCheckCost = std::max(*NewMemCheckCost.getValue(),
2000                                      (InstructionCost::CostType)1);
2001 
2002           if (BestTripCount > 1)
2003             LLVM_DEBUG(dbgs()
2004                        << "We expect runtime memory checks to be hoisted "
2005                        << "out of the outer loop. Cost reduced from "
2006                        << MemCheckCost << " to " << NewMemCheckCost << '\n');
2007 
2008           MemCheckCost = NewMemCheckCost;
2009         }
2010       }
2011 
2012       RTCheckCost += MemCheckCost;
2013     }
2014 
2015     if (SCEVCheckBlock || MemCheckBlock)
2016       LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
2017                         << "\n");
2018 
2019     return RTCheckCost;
2020   }
2021 
2022   /// Remove the created SCEV & memory runtime check blocks & instructions, if
2023   /// unused.
2024   ~GeneratedRTChecks() {
2025     SCEVExpanderCleaner SCEVCleaner(SCEVExp);
2026     SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
2027     if (!SCEVCheckCond)
2028       SCEVCleaner.markResultUsed();
2029 
2030     if (!MemRuntimeCheckCond)
2031       MemCheckCleaner.markResultUsed();
2032 
2033     if (MemRuntimeCheckCond) {
2034       auto &SE = *MemCheckExp.getSE();
2035       // Memory runtime check generation creates compares that use expanded
2036       // values. Remove them before running the SCEVExpanderCleaners.
2037       for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
2038         if (MemCheckExp.isInsertedInstruction(&I))
2039           continue;
2040         SE.forgetValue(&I);
2041         I.eraseFromParent();
2042       }
2043     }
2044     MemCheckCleaner.cleanup();
2045     SCEVCleaner.cleanup();
2046 
2047     if (SCEVCheckCond)
2048       SCEVCheckBlock->eraseFromParent();
2049     if (MemRuntimeCheckCond)
2050       MemCheckBlock->eraseFromParent();
2051   }
2052 
2053   /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
2054   /// adjusts the branches to branch to the vector preheader or \p Bypass,
2055   /// depending on the generated condition.
2056   BasicBlock *emitSCEVChecks(BasicBlock *Bypass,
2057                              BasicBlock *LoopVectorPreHeader) {
2058     if (!SCEVCheckCond)
2059       return nullptr;
2060 
2061     Value *Cond = SCEVCheckCond;
2062     // Mark the check as used, to prevent it from being removed during cleanup.
2063     SCEVCheckCond = nullptr;
2064     if (auto *C = dyn_cast<ConstantInt>(Cond))
2065       if (C->isZero())
2066         return nullptr;
2067 
2068     auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2069 
2070     BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
2071     // Create new preheader for vector loop.
2072     if (OuterLoop)
2073       OuterLoop->addBasicBlockToLoop(SCEVCheckBlock, *LI);
2074 
2075     SCEVCheckBlock->getTerminator()->eraseFromParent();
2076     SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2077     Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2078                                                 SCEVCheckBlock);
2079 
2080     DT->addNewBlock(SCEVCheckBlock, Pred);
2081     DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
2082 
2083     BranchInst &BI = *BranchInst::Create(Bypass, LoopVectorPreHeader, Cond);
2084     if (AddBranchWeights)
2085       setBranchWeights(BI, SCEVCheckBypassWeights, /*IsExpected=*/false);
2086     ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), &BI);
2087     return SCEVCheckBlock;
2088   }
2089 
2090   /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2091   /// the branches to branch to the vector preheader or \p Bypass, depending on
2092   /// the generated condition.
2093   BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass,
2094                                    BasicBlock *LoopVectorPreHeader) {
2095     // Check if we generated code that checks in runtime if arrays overlap.
2096     if (!MemRuntimeCheckCond)
2097       return nullptr;
2098 
2099     auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2100     Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2101                                                 MemCheckBlock);
2102 
2103     DT->addNewBlock(MemCheckBlock, Pred);
2104     DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
2105     MemCheckBlock->moveBefore(LoopVectorPreHeader);
2106 
2107     if (OuterLoop)
2108       OuterLoop->addBasicBlockToLoop(MemCheckBlock, *LI);
2109 
2110     BranchInst &BI =
2111         *BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond);
2112     if (AddBranchWeights) {
2113       setBranchWeights(BI, MemCheckBypassWeights, /*IsExpected=*/false);
2114     }
2115     ReplaceInstWithInst(MemCheckBlock->getTerminator(), &BI);
2116     MemCheckBlock->getTerminator()->setDebugLoc(
2117         Pred->getTerminator()->getDebugLoc());
2118 
2119     // Mark the check as used, to prevent it from being removed during cleanup.
2120     MemRuntimeCheckCond = nullptr;
2121     return MemCheckBlock;
2122   }
2123 };
2124 } // namespace
2125 
2126 static bool useActiveLaneMask(TailFoldingStyle Style) {
2127   return Style == TailFoldingStyle::Data ||
2128          Style == TailFoldingStyle::DataAndControlFlow ||
2129          Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2130 }
2131 
2132 static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style) {
2133   return Style == TailFoldingStyle::DataAndControlFlow ||
2134          Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2135 }
2136 
2137 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
2138 // vectorization. The loop needs to be annotated with #pragma omp simd
2139 // simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If the
2140 // vector length information is not provided, vectorization is not considered
2141 // explicit. Interleave hints are not allowed either. These limitations will be
2142 // relaxed in the future.
2143 // Please note that we are currently forced to abuse the pragma 'clang
2144 // vectorize' semantics. This pragma provides *auto-vectorization hints*
2145 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2146 // provides *explicit vectorization hints* (LV can bypass legal checks and
2147 // assume that vectorization is legal). However, both hints are implemented
2148 // using the same metadata (llvm.loop.vectorize, processed by
2149 // LoopVectorizeHints). This will be fixed in the future when the native IR
2150 // representation for pragma 'omp simd' is introduced.
2151 static bool isExplicitVecOuterLoop(Loop *OuterLp,
2152                                    OptimizationRemarkEmitter *ORE) {
2153   assert(!OuterLp->isInnermost() && "This is not an outer loop");
2154   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2155 
2156   // Only outer loops with an explicit vectorization hint are supported.
2157   // Unannotated outer loops are ignored.
2158   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
2159     return false;
2160 
2161   Function *Fn = OuterLp->getHeader()->getParent();
2162   if (!Hints.allowVectorization(Fn, OuterLp,
2163                                 true /*VectorizeOnlyWhenForced*/)) {
2164     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2165     return false;
2166   }
2167 
2168   if (Hints.getInterleave() > 1) {
2169     // TODO: Interleave support is future work.
2170     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2171                          "outer loops.\n");
2172     Hints.emitRemarkWithHints();
2173     return false;
2174   }
2175 
2176   return true;
2177 }
2178 
2179 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
2180                                   OptimizationRemarkEmitter *ORE,
2181                                   SmallVectorImpl<Loop *> &V) {
2182   // Collect inner loops and outer loops without irreducible control flow. For
2183   // now, only collect outer loops that have explicit vectorization hints. If we
2184   // are stress testing the VPlan H-CFG construction, we collect the outermost
2185   // loop of every loop nest.
2186   if (L.isInnermost() || VPlanBuildStressTest ||
2187       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
2188     LoopBlocksRPO RPOT(&L);
2189     RPOT.perform(LI);
2190     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2191       V.push_back(&L);
2192       // TODO: Collect inner loops inside marked outer loops in case
2193       // vectorization fails for the outer loop. Do not invoke
2194       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2195       // already known to be reducible. We can use an inherited attribute for
2196       // that.
2197       return;
2198     }
2199   }
2200   for (Loop *InnerL : L)
2201     collectSupportedLoops(*InnerL, LI, ORE, V);
2202 }
2203 
2204 //===----------------------------------------------------------------------===//
2205 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2206 // LoopVectorizationCostModel and LoopVectorizationPlanner.
2207 //===----------------------------------------------------------------------===//
2208 
2209 /// Compute the transformed value of Index at offset StartValue using step
2210 /// StepValue.
2211 /// For integer induction, returns StartValue + Index * StepValue.
2212 /// For pointer induction, returns StartValue[Index * StepValue].
2213 /// FIXME: The newly created binary instructions should contain nsw/nuw
2214 /// flags, which can be found from the original scalar operations.
2215 static Value *
2216 emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue,
2217                      Value *Step,
2218                      InductionDescriptor::InductionKind InductionKind,
2219                      const BinaryOperator *InductionBinOp) {
2220   Type *StepTy = Step->getType();
2221   Value *CastedIndex = StepTy->isIntegerTy()
2222                            ? B.CreateSExtOrTrunc(Index, StepTy)
2223                            : B.CreateCast(Instruction::SIToFP, Index, StepTy);
2224   if (CastedIndex != Index) {
2225     CastedIndex->setName(CastedIndex->getName() + ".cast");
2226     Index = CastedIndex;
2227   }
2228 
2229   // Note: the IR at this point is broken. We cannot use SE to create any new
2230   // SCEV and then expand it, hoping that SCEV's simplification will give us
2231   // more optimal code. Unfortunately, attempting to do so on invalid IR may
2232   // lead to various SCEV crashes. So all we can do is use the builder and rely
2233   // on InstCombine for future simplifications. Here we handle some trivial
2234   // cases only.
2235   auto CreateAdd = [&B](Value *X, Value *Y) {
2236     assert(X->getType() == Y->getType() && "Types don't match!");
2237     if (auto *CX = dyn_cast<ConstantInt>(X))
2238       if (CX->isZero())
2239         return Y;
2240     if (auto *CY = dyn_cast<ConstantInt>(Y))
2241       if (CY->isZero())
2242         return X;
2243     return B.CreateAdd(X, Y);
2244   };
2245 
2246   // We allow X to be a vector type, in which case Y will potentially be
2247   // splatted into a vector with the same element count.
2248   auto CreateMul = [&B](Value *X, Value *Y) {
2249     assert(X->getType()->getScalarType() == Y->getType() &&
2250            "Types don't match!");
2251     if (auto *CX = dyn_cast<ConstantInt>(X))
2252       if (CX->isOne())
2253         return Y;
2254     if (auto *CY = dyn_cast<ConstantInt>(Y))
2255       if (CY->isOne())
2256         return X;
2257     VectorType *XVTy = dyn_cast<VectorType>(X->getType());
2258     if (XVTy && !isa<VectorType>(Y->getType()))
2259       Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
2260     return B.CreateMul(X, Y);
2261   };
2262 
2263   switch (InductionKind) {
2264   case InductionDescriptor::IK_IntInduction: {
2265     assert(!isa<VectorType>(Index->getType()) &&
2266            "Vector indices not supported for integer inductions yet");
2267     assert(Index->getType() == StartValue->getType() &&
2268            "Index type does not match StartValue type");
2269     if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne())
2270       return B.CreateSub(StartValue, Index);
2271     auto *Offset = CreateMul(Index, Step);
2272     return CreateAdd(StartValue, Offset);
2273   }
2274   case InductionDescriptor::IK_PtrInduction:
2275     return B.CreatePtrAdd(StartValue, CreateMul(Index, Step));
2276   case InductionDescriptor::IK_FpInduction: {
2277     assert(!isa<VectorType>(Index->getType()) &&
2278            "Vector indices not supported for FP inductions yet");
2279     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2280     assert(InductionBinOp &&
2281            (InductionBinOp->getOpcode() == Instruction::FAdd ||
2282             InductionBinOp->getOpcode() == Instruction::FSub) &&
2283            "Original bin op should be defined for FP induction");
2284 
2285     Value *MulExp = B.CreateFMul(Step, Index);
2286     return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2287                          "induction");
2288   }
2289   case InductionDescriptor::IK_NoInduction:
2290     return nullptr;
2291   }
2292   llvm_unreachable("invalid enum");
2293 }
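
// Worked example (illustrative values): for an integer induction with
// StartValue = 7 and Step = 3, the transformed value at Index = 4 is
// 7 + 4 * 3 = 19; for a pointer induction it is instead the address
// StartValue[4 * 3], emitted as a ptradd.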
2294 
2295 std::optional<unsigned> getMaxVScale(const Function &F,
2296                                      const TargetTransformInfo &TTI) {
2297   if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale())
2298     return MaxVScale;
2299 
2300   if (F.hasFnAttribute(Attribute::VScaleRange))
2301     return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
2302 
2303   return std::nullopt;
2304 }
2305 
2306 /// For the given VF and UF and maximum trip count computed for the loop, return
2307 /// true if the induction variable cannot overflow in the vectorized loop; in
2308 /// that case the runtime overflow check always evaluates to false and can be
2309 /// removed.
2310 static bool isIndvarOverflowCheckKnownFalse(
2311     const LoopVectorizationCostModel *Cost,
2312     ElementCount VF, std::optional<unsigned> UF = std::nullopt) {
2313   // Always be conservative if we don't know the exact unroll factor.
2314   unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF);
2315 
2316   Type *IdxTy = Cost->Legal->getWidestInductionType();
2317   APInt MaxUIntTripCount = cast<IntegerType>(IdxTy)->getMask();
2318 
2319   // We know the runtime overflow check is known false iff the (max) trip-count
2320   // is known and (max) trip-count + (VF * UF) does not overflow in the type of
2321   // the vector loop induction variable.
2322   if (unsigned TC = Cost->PSE.getSmallConstantMaxTripCount()) {
2323     uint64_t MaxVF = VF.getKnownMinValue();
2324     if (VF.isScalable()) {
2325       std::optional<unsigned> MaxVScale =
2326           getMaxVScale(*Cost->TheFunction, Cost->TTI);
2327       if (!MaxVScale)
2328         return false;
2329       MaxVF *= *MaxVScale;
2330     }
2331 
2332     return (MaxUIntTripCount - TC).ugt(MaxVF * MaxUF);
2333   }
2334 
2335   return false;
2336 }
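
// Worked example (for intuition): if the widest induction type is i8, the mask
// is 255. With a known maximum trip count of 200 and VF * UF = 16, we have
// 255 - 200 = 55 > 16, so trip count + VF * UF cannot wrap and the overflow
// check is known to be false.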
2337 
2338 // Return whether we allow using masked interleave-groups (for dealing with
2339 // strided loads/stores that reside in predicated blocks, or for dealing
2340 // with gaps).
2341 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2342   // If an override option has been passed in for interleaved accesses, use it.
2343   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2344     return EnableMaskedInterleavedMemAccesses;
2345 
2346   return TTI.enableMaskedInterleavedAccessVectorization();
2347 }
2348 
2349 void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr,
2350                                                VPReplicateRecipe *RepRecipe,
2351                                                const VPLane &Lane,
2352                                                VPTransformState &State) {
2353   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2354 
2355   // Does this instruction return a value?
2356   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2357 
2358   Instruction *Cloned = Instr->clone();
2359   if (!IsVoidRetTy) {
2360     Cloned->setName(Instr->getName() + ".cloned");
2361 #if !defined(NDEBUG)
2362     // Verify that VPlan type inference results agree with the type of the
2363     // generated values.
2364     assert(State.TypeAnalysis.inferScalarType(RepRecipe) == Cloned->getType() &&
2365            "inferred type and type from generated instructions do not match");
2366 #endif
2367   }
2368 
2369   RepRecipe->setFlags(Cloned);
2370 
2371   if (auto DL = Instr->getDebugLoc())
2372     State.setDebugLocFrom(DL);
2373 
2374   // Replace the operands of the cloned instructions with their scalar
2375   // equivalents in the new loop.
2376   for (const auto &I : enumerate(RepRecipe->operands())) {
2377     auto InputLane = Lane;
2378     VPValue *Operand = I.value();
2379     if (vputils::isUniformAfterVectorization(Operand))
2380       InputLane = VPLane::getFirstLane();
2381     Cloned->setOperand(I.index(), State.get(Operand, InputLane));
2382   }
2383   State.addNewMetadata(Cloned, Instr);
2384 
2385   // Place the cloned scalar in the new loop.
2386   State.Builder.Insert(Cloned);
2387 
2388   State.set(RepRecipe, Cloned, Lane);
2389 
2390   // If we just cloned a new assumption, add it the assumption cache.
2391   if (auto *II = dyn_cast<AssumeInst>(Cloned))
2392     AC->registerAssumption(II);
2393 
2394   // End if-block.
2395   VPRegionBlock *Parent = RepRecipe->getParent()->getParent();
2396   bool IfPredicateInstr = Parent ? Parent->isReplicator() : false;
2397   assert(
2398       (Parent || !RepRecipe->getParent()->getPlan()->getVectorLoopRegion() ||
2399        all_of(RepRecipe->operands(),
2400               [](VPValue *Op) { return Op->isDefinedOutsideLoopRegions(); })) &&
2401       "Expected a recipe is either within a region or all of its operands "
2402       "are defined outside the vectorized region.");
2403   if (IfPredicateInstr)
2404     PredicatedInstructions.push_back(Cloned);
2405 }
2406 
2407 Value *
2408 InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
2409   if (VectorTripCount)
2410     return VectorTripCount;
2411 
2412   Value *TC = getTripCount();
2413   IRBuilder<> Builder(InsertBlock->getTerminator());
2414 
2415   Type *Ty = TC->getType();
2416   // This is where we can make the step a runtime constant.
2417   Value *Step = createStepForVF(Builder, Ty, VF, UF);
2418 
2419   // If the tail is to be folded by masking, round the number of iterations N
2420   // up to a multiple of Step instead of rounding down. This is done by first
2421   // adding Step-1 and then rounding down. Note that it's ok if this addition
2422   // overflows: the vector induction variable will eventually wrap to zero given
2423   // that it starts at zero and its Step is a power of two; the loop will then
2424   // exit, with the last early-exit vector comparison also producing all-true.
2425   // For scalable vectors the VF is not guaranteed to be a power of 2, but this
2426   // is accounted for in emitIterationCountCheck that adds an overflow check.
2427   if (Cost->foldTailByMasking()) {
2428     assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
2429            "VF*UF must be a power of 2 when folding tail by masking");
2430     TC = Builder.CreateAdd(TC, Builder.CreateSub(Step, ConstantInt::get(Ty, 1)),
2431                            "n.rnd.up");
2432   }
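  // For example, assuming VF = 4 and UF = 2 (Step = 8) with TC = 10: the
  // rounded-up count is 10 + 7 = 17, and the N - (N % Step) computation below
  // yields a vector trip count of 16, i.e. 10 rounded up to a multiple of 8.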
2433 
2434   // Now we need to generate the expression for the part of the loop that the
2435   // vectorized body will execute. This is equal to N - (N % Step) if scalar
2436   // iterations are not required for correctness, or N - Step, otherwise. Step
2437   // is equal to the vectorization factor (number of SIMD elements) times the
2438   // unroll factor (number of SIMD instructions).
2439   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2440 
2441   // There are cases where we *must* run at least one iteration in the remainder
2442   // loop.  See the cost model for when this can happen.  If the step evenly
2443   // divides the trip count, we set the remainder to be equal to the step. If
2444   // the step does not evenly divide the trip count, no adjustment is necessary
2445   // since there will already be scalar iterations. Note that the minimum
2446   // iterations check ensures that N >= Step.
2447   if (Cost->requiresScalarEpilogue(VF.isVector())) {
2448     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2449     R = Builder.CreateSelect(IsZero, Step, R);
2450   }
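  // For example, assuming Step = 8 and TC = 16 with a required scalar
  // epilogue: R is 0 and is replaced by 8, so the vector trip count below
  // becomes 8 and the remaining 8 iterations run in the scalar loop.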
2451 
2452   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2453 
2454   return VectorTripCount;
2455 }
2456 
2457 void InnerLoopVectorizer::introduceCheckBlockInVPlan(BasicBlock *CheckIRBB) {
2458   VPBlockBase *ScalarPH = Plan.getScalarPreheader();
2459   VPBlockBase *PreVectorPH = VectorPHVPB->getSinglePredecessor();
2460   if (PreVectorPH->getNumSuccessors() != 1) {
2461     assert(PreVectorPH->getNumSuccessors() == 2 && "Expected 2 successors");
2462     assert(PreVectorPH->getSuccessors()[0] == ScalarPH &&
2463            "Unexpected successor");
2464     VPIRBasicBlock *CheckVPIRBB = Plan.createVPIRBasicBlock(CheckIRBB);
2465     VPBlockUtils::insertOnEdge(PreVectorPH, VectorPHVPB, CheckVPIRBB);
2466     PreVectorPH = CheckVPIRBB;
2467   }
2468   VPBlockUtils::connectBlocks(PreVectorPH, ScalarPH);
2469   PreVectorPH->swapSuccessors();
2470 }
2471 
2472 void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
2473   Value *Count = getTripCount();
2474   // Reuse existing vector loop preheader for TC checks.
2475   // Note that new preheader block is generated for vector loop.
2476   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2477   IRBuilder<> Builder(TCCheckBlock->getTerminator());
2478 
2479   // Generate code to check if the loop's trip count is less than VF * UF, or
2480   // equal to it in case a scalar epilogue is required; this implies that the
2481   // vector trip count is zero. This check also covers the case where adding one
2482   // to the backedge-taken count overflowed, leading to an incorrect trip count
2483   // of zero. In this case we will also jump to the scalar loop.
2484   auto P = Cost->requiresScalarEpilogue(VF.isVector()) ? ICmpInst::ICMP_ULE
2485                                                        : ICmpInst::ICMP_ULT;
2486 
2487   // If tail is to be folded, vector loop takes care of all iterations.
2488   Type *CountTy = Count->getType();
2489   Value *CheckMinIters = Builder.getFalse();
2490   auto CreateStep = [&]() -> Value * {
2491     // Create step with max(MinProfitableTripCount, UF * VF).
2492     if (UF * VF.getKnownMinValue() >= MinProfitableTripCount.getKnownMinValue())
2493       return createStepForVF(Builder, CountTy, VF, UF);
2494 
2495     Value *MinProfTC =
2496         createStepForVF(Builder, CountTy, MinProfitableTripCount, 1);
2497     if (!VF.isScalable())
2498       return MinProfTC;
2499     return Builder.CreateBinaryIntrinsic(
2500         Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF));
2501   };
2502 
2503   TailFoldingStyle Style = Cost->getTailFoldingStyle();
2504   if (Style == TailFoldingStyle::None) {
2505     Value *Step = CreateStep();
2506     ScalarEvolution &SE = *PSE.getSE();
2507     // TODO: Emit unconditional branch to vector preheader instead of
2508     // conditional branch with known condition.
2509     const SCEV *TripCountSCEV = SE.applyLoopGuards(SE.getSCEV(Count), OrigLoop);
2510     // Check if the trip count is < the step.
2511     if (SE.isKnownPredicate(P, TripCountSCEV, SE.getSCEV(Step))) {
2512       // TODO: Ensure step is at most the trip count when determining max VF and
2513       // UF, w/o tail folding.
2514       CheckMinIters = Builder.getTrue();
2515     } else if (!SE.isKnownPredicate(CmpInst::getInversePredicate(P),
2516                                     TripCountSCEV, SE.getSCEV(Step))) {
2517       // Generate the minimum iteration check only if we cannot prove the
2518       // check is known to be true, or known to be false.
2519       CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
2520     } // else step known to be < trip count, use CheckMinIters preset to false.
2521   } else if (VF.isScalable() &&
2522              !isIndvarOverflowCheckKnownFalse(Cost, VF, UF) &&
2523              Style != TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) {
2524     // vscale is not necessarily a power-of-2, which means we cannot guarantee
2525     // an overflow to zero when updating induction variables and so an
2526     // additional overflow check is required before entering the vector loop.
2527 
2528     // Get the maximum unsigned value for the type.
2529     Value *MaxUIntTripCount =
2530         ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask());
2531     Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count);
2532 
2533     // Don't execute the vector loop if (UMax - n) < (VF * UF).
2534     CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep());
2535   }
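  // Illustrative sketch of the check typically emitted here for fixed VF = 4,
  // UF = 2, no tail folding, and no required scalar epilogue (the branch
  // itself is created below; value names are placeholders):
  //   %min.iters.check = icmp ult i64 %trip.count, 8
  //   br i1 %min.iters.check, label %scalar.ph, label %vector.ph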
2536 
2537   // Create new preheader for vector loop.
2538   LoopVectorPreHeader =
2539       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
2540                  "vector.ph");
2541 
2542   assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
2543                                DT->getNode(Bypass)->getIDom()) &&
2544          "TC check is expected to dominate Bypass");
2545 
2546   BranchInst &BI =
2547       *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
2548   if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
2549     setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
2550   ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
2551   LoopBypassBlocks.push_back(TCCheckBlock);
2552 
2553   // TODO: Wrap LoopVectorPreHeader in VPIRBasicBlock here.
2554   introduceCheckBlockInVPlan(TCCheckBlock);
2555 }
2556 
2557 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {
2558   BasicBlock *const SCEVCheckBlock =
2559       RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader);
2560   if (!SCEVCheckBlock)
2561     return nullptr;
2562 
2563   assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
2564            (OptForSizeBasedOnProfile &&
2565             Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
2566          "Cannot SCEV check stride or overflow when optimizing for size");
2567   assert(!LoopBypassBlocks.empty() &&
2568          "Should already be a bypass block due to iteration count check");
2569   LoopBypassBlocks.push_back(SCEVCheckBlock);
2570   AddedSafetyChecks = true;
2571 
2572   introduceCheckBlockInVPlan(SCEVCheckBlock);
2573   return SCEVCheckBlock;
2574 }
2575 
2576 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) {
2577   // VPlan-native path does not do any analysis for runtime checks currently.
2578   if (EnableVPlanNativePath)
2579     return nullptr;
2580 
2581   BasicBlock *const MemCheckBlock =
2582       RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader);
2583 
2584   // Check if we generated code that checks in runtime if arrays overlap. We put
2585   // the checks into a separate block to make the more common case of few
2586   // elements faster.
2587   if (!MemCheckBlock)
2588     return nullptr;
2589 
2590   if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
2591     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
2592            "Cannot emit memory checks when optimizing for size, unless forced "
2593            "to vectorize.");
2594     ORE->emit([&]() {
2595       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
2596                                         OrigLoop->getStartLoc(),
2597                                         OrigLoop->getHeader())
2598              << "Code-size may be reduced by not forcing "
2599                 "vectorization, or by source-code modifications "
2600                 "eliminating the need for runtime checks "
2601                 "(e.g., adding 'restrict').";
2602     });
2603   }
2604 
2605   LoopBypassBlocks.push_back(MemCheckBlock);
2606 
2607   AddedSafetyChecks = true;
2608 
2609   introduceCheckBlockInVPlan(MemCheckBlock);
2610   return MemCheckBlock;
2611 }
2612 
2613 /// Replace \p VPBB with a VPIRBasicBlock wrapping \p IRBB. All recipes from \p
2614 /// VPBB are moved to the end of the newly created VPIRBasicBlock. VPBB must
2615 /// have a single predecessor, which is rewired to the new VPIRBasicBlock. All
2616 /// successors of VPBB, if any, are rewired to the new VPIRBasicBlock.
2617 static void replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB) {
2618   VPIRBasicBlock *IRVPBB = VPBB->getPlan()->createVPIRBasicBlock(IRBB);
2619   for (auto &R : make_early_inc_range(*VPBB)) {
2620     assert(!R.isPhi() && "Tried to move phi recipe to end of block");
2621     R.moveBefore(*IRVPBB, IRVPBB->end());
2622   }
2623 
2624   VPBlockUtils::reassociateBlocks(VPBB, IRVPBB);
2625   // VPBB is now dead and will be cleaned up when the plan gets destroyed.
2626 }
2627 
2628 void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
2629   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
2630   assert(LoopVectorPreHeader && "Invalid loop structure");
2631   assert((OrigLoop->getUniqueLatchExitBlock() ||
2632           Cost->requiresScalarEpilogue(VF.isVector())) &&
2633          "loops not exiting via the latch without required epilogue?");
2634 
2635   LoopMiddleBlock =
2636       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
2637                  LI, nullptr, Twine(Prefix) + "middle.block");
2638   replaceVPBBWithIRVPBB(Plan.getMiddleBlock(), LoopMiddleBlock);
2639   LoopScalarPreHeader =
2640       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
2641                  nullptr, Twine(Prefix) + "scalar.ph");
2642   replaceVPBBWithIRVPBB(Plan.getScalarPreheader(), LoopScalarPreHeader);
2643 }
2644 
2645 /// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
2646 /// expansion results.
2647 static Value *getExpandedStep(const InductionDescriptor &ID,
2648                               const SCEV2ValueTy &ExpandedSCEVs) {
2649   const SCEV *Step = ID.getStep();
2650   if (auto *C = dyn_cast<SCEVConstant>(Step))
2651     return C->getValue();
2652   if (auto *U = dyn_cast<SCEVUnknown>(Step))
2653     return U->getValue();
2654   auto I = ExpandedSCEVs.find(Step);
2655   assert(I != ExpandedSCEVs.end() && "SCEV must be expanded at this point");
2656   return I->second;
2657 }
2658 
2659 /// Knowing that loop \p L executes a single vector iteration, add instructions
2660 /// that will get simplified and thus should not have any cost to \p
2661 /// InstsToIgnore.
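/// For example, if a loop with trip count 4 is vectorized with VF = 4, its
/// latch compare and an induction increment used only by that compare and by
/// the induction phi fold away, so they are added to \p InstsToIgnore and
/// their cost is not counted.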
2662 static void addFullyUnrolledInstructionsToIgnore(
2663     Loop *L, const LoopVectorizationLegality::InductionList &IL,
2664     SmallPtrSetImpl<Instruction *> &InstsToIgnore) {
2665   auto *Cmp = L->getLatchCmpInst();
2666   if (Cmp)
2667     InstsToIgnore.insert(Cmp);
2668   for (const auto &KV : IL) {
2669     // Extract the key by hand so that it can be used in the lambda below.  Note
2670     // that captured structured bindings are a C++20 extension.
2671     const PHINode *IV = KV.first;
2672 
2673     // Get next iteration value of the induction variable.
2674     Instruction *IVInst =
2675         cast<Instruction>(IV->getIncomingValueForBlock(L->getLoopLatch()));
2676     if (all_of(IVInst->users(),
2677                [&](const User *U) { return U == IV || U == Cmp; }))
2678       InstsToIgnore.insert(IVInst);
2679   }
2680 }
2681 
2682 void InnerLoopVectorizer::createInductionAdditionalBypassValues(
2683     const SCEV2ValueTy &ExpandedSCEVs, Value *MainVectorTripCount) {
2684   assert(MainVectorTripCount && "Must have bypass information");
2685 
2686   Instruction *OldInduction = Legal->getPrimaryInduction();
2687   IRBuilder<> BypassBuilder(getAdditionalBypassBlock(),
2688                             getAdditionalBypassBlock()->getFirstInsertionPt());
2689   for (const auto &InductionEntry : Legal->getInductionVars()) {
2690     PHINode *OrigPhi = InductionEntry.first;
2691     const InductionDescriptor &II = InductionEntry.second;
2692     Value *Step = getExpandedStep(II, ExpandedSCEVs);
2693     // For the primary induction the additional bypass end value is known.
2694     // Otherwise it is computed.
2695     Value *EndValueFromAdditionalBypass = MainVectorTripCount;
2696     if (OrigPhi != OldInduction) {
2697       auto *BinOp = II.getInductionBinOp();
2698       // Fast-math-flags propagate from the original induction instruction.
2699       if (isa_and_nonnull<FPMathOperator>(BinOp))
2700         BypassBuilder.setFastMathFlags(BinOp->getFastMathFlags());
2701 
2702       // Compute the end value for the additional bypass.
2703       EndValueFromAdditionalBypass =
2704           emitTransformedIndex(BypassBuilder, MainVectorTripCount,
2705                                II.getStartValue(), Step, II.getKind(), BinOp);
2706       EndValueFromAdditionalBypass->setName("ind.end");
2707     }
2708 
2709     // Store the bypass value here, as it needs to be added as operand to its
2710     // scalar preheader phi node after the epilogue skeleton has been created.
2711     // TODO: Directly add as extra operand to the VPResumePHI recipe.
2712     assert(!Induction2AdditionalBypassValue.contains(OrigPhi) &&
2713            "entry for OrigPhi already exits");
2714     Induction2AdditionalBypassValue[OrigPhi] = EndValueFromAdditionalBypass;
2715   }
2716 }
2717 
2718 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton(
2719     const SCEV2ValueTy &ExpandedSCEVs) {
2720   /*
2721    In this function we generate a new loop. The new loop will contain
2722    the vectorized instructions while the old loop will continue to run the
2723    scalar remainder.
2724 
2725        [ ] <-- old preheader - loop iteration number check and SCEVs in Plan's
2726      /  |      preheader are expanded here. Eventually all required SCEV
2727     /   |      expansion should happen here.
2728    /    v
2729   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
2730   |  /  |
2731   | /   v
2732   ||   [ ]     <-- vector pre header.
2733   |/    |
2734   |     v
2735   |    [  ] \
2736   |    [  ]_|   <-- vector loop (created during VPlan execution).
2737   |     |
2738   |     v
2739   \   -[ ]   <--- middle-block (wrapped in VPIRBasicBlock with the branch to
2740    |    |                       successors created during VPlan execution)
2741    \/   |
2742    /\   v
2743    | ->[ ]     <--- new preheader (wrapped in VPIRBasicBlock).
2744    |    |
2745  (opt)  v      <-- edge from middle to exit iff epilogue is not required.
2746    |   [ ] \
2747    |   [ ]_|   <-- old scalar loop to handle remainder (scalar epilogue, header
2748    |    |          wrapped in VPIRBasicBlock).
2749     \   |
2750      \  v
2751       >[ ]     <-- exit block(s). (wrapped in VPIRBasicBlock)
2752    ...
2753    */
2754 
2755   // Create an empty vector loop, and prepare basic blocks for the runtime
2756   // checks.
2757   createVectorLoopSkeleton("");
2758 
2759   // Now, compare the new count to zero. If it is zero skip the vector loop and
2760   // jump to the scalar loop. This check also covers the case where the
2761   // backedge-taken count is uint##_max: adding one to it will overflow, leading
2762   // to an incorrect trip count of zero. In this (rare) case we will also jump
2763   // to the scalar loop.
2764   emitIterationCountCheck(LoopScalarPreHeader);
2765 
2766   // Generate the code to check any assumptions that we've made for SCEV
2767   // expressions.
2768   emitSCEVChecks(LoopScalarPreHeader);
2769 
2770   // Generate the code that checks in runtime if arrays overlap. We put the
2771   // checks into a separate block to make the more common case of few elements
2772   // faster.
2773   emitMemRuntimeChecks(LoopScalarPreHeader);
2774 
2775   return LoopVectorPreHeader;
2776 }
2777 
2778 // Fix up external users of the induction variable. At this point, we are
2779 // in LCSSA form, with all external PHIs that use the IV having one input value,
2780 // coming from the remainder loop. We need those PHIs to also have a correct
2781 // value for the IV when arriving directly from the middle block.
2782 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
2783                                        const InductionDescriptor &II,
2784                                        Value *VectorTripCount,
2785                                        BasicBlock *MiddleBlock,
2786                                        VPTransformState &State) {
2787   // There are two kinds of external IV usages - those that use the value
2788   // computed in the last iteration (the PHI) and those that use the penultimate
2789   // value (the value that feeds into the phi from the loop latch).
2790   // We allow both, but they, obviously, have different values.
2791 
2792   DenseMap<Value *, Value *> MissingVals;
2793 
2794   Value *EndValue = cast<PHINode>(OrigPhi->getIncomingValueForBlock(
2795                                       OrigLoop->getLoopPreheader()))
2796                         ->getIncomingValueForBlock(MiddleBlock);
2797 
2798   // An external user of the last iteration's value should see the value that
2799   // the remainder loop uses to initialize its own IV.
2800   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
2801   for (User *U : PostInc->users()) {
2802     Instruction *UI = cast<Instruction>(U);
2803     if (!OrigLoop->contains(UI)) {
2804       assert(isa<PHINode>(UI) && "Expected LCSSA form");
2805       MissingVals[UI] = EndValue;
2806     }
2807   }
2808 
2809   // An external user of the penultimate value needs to see EndValue - Step.
2810   // The simplest way to get this is to recompute it from the constituent SCEVs,
2811   // that is Start + (Step * (CRD - 1)).
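  // For example, for a canonical induction variable starting at 0 with step 1,
  // EndValue is the vector trip count %n.vec, and an external user of the
  // penultimate value receives %n.vec - 1.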
2812   for (User *U : OrigPhi->users()) {
2813     auto *UI = cast<Instruction>(U);
2814     if (!OrigLoop->contains(UI)) {
2815       assert(isa<PHINode>(UI) && "Expected LCSSA form");
2816       IRBuilder<> B(MiddleBlock->getTerminator());
2817 
2818       // Fast-math-flags propagate from the original induction instruction.
2819       if (isa_and_nonnull<FPMathOperator>(II.getInductionBinOp()))
2820         B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
2821 
2822       VPValue *StepVPV = Plan.getSCEVExpansion(II.getStep());
2823       assert(StepVPV && "step must have been expanded during VPlan execution");
2824       Value *Step = StepVPV->isLiveIn() ? StepVPV->getLiveInIRValue()
2825                                         : State.get(StepVPV, VPLane(0));
2826       Value *Escape = nullptr;
2827       if (EndValue->getType()->isIntegerTy())
2828         Escape = B.CreateSub(EndValue, Step);
2829       else if (EndValue->getType()->isPointerTy())
2830         Escape = B.CreatePtrAdd(EndValue, B.CreateNeg(Step));
2831       else {
2832         assert(EndValue->getType()->isFloatingPointTy() &&
2833                "Unexpected induction type");
2834         Escape = B.CreateBinOp(II.getInductionBinOp()->getOpcode() ==
2835                                        Instruction::FAdd
2836                                    ? Instruction::FSub
2837                                    : Instruction::FAdd,
2838                                EndValue, Step);
2839       }
2840       Escape->setName("ind.escape");
2841       MissingVals[UI] = Escape;
2842     }
2843   }
2844 
2845   assert((MissingVals.empty() ||
2846           all_of(MissingVals,
2847                  [MiddleBlock, this](const std::pair<Value *, Value *> &P) {
2848                    return all_of(
2849                        predecessors(cast<Instruction>(P.first)->getParent()),
2850                        [MiddleBlock, this](BasicBlock *Pred) {
2851                          return Pred == MiddleBlock ||
2852                                 Pred == OrigLoop->getLoopLatch();
2853                        });
2854                  })) &&
2855          "Expected escaping values from latch/middle.block only");
2856 
2857   for (auto &I : MissingVals) {
2858     PHINode *PHI = cast<PHINode>(I.first);
2859     // One corner case we have to handle is two IVs "chasing" each-other,
2860     // that is %IV2 = phi [...], [ %IV1, %latch ]
2861     // In this case, if IV1 has an external use, we need to avoid adding both
2862     // "last value of IV1" and "penultimate value of IV2". So, verify that we
2863     // don't already have an incoming value for the middle block.
2864     if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
2865       PHI->addIncoming(I.second, MiddleBlock);
2866   }
2867 }
2868 
2869 namespace {
2870 
2871 struct CSEDenseMapInfo {
2872   static bool canHandle(const Instruction *I) {
2873     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
2874            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
2875   }
2876 
2877   static inline Instruction *getEmptyKey() {
2878     return DenseMapInfo<Instruction *>::getEmptyKey();
2879   }
2880 
2881   static inline Instruction *getTombstoneKey() {
2882     return DenseMapInfo<Instruction *>::getTombstoneKey();
2883   }
2884 
2885   static unsigned getHashValue(const Instruction *I) {
2886     assert(canHandle(I) && "Unknown instruction!");
2887     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
2888                                                            I->value_op_end()));
2889   }
2890 
2891   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
2892     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
2893         LHS == getTombstoneKey() || RHS == getTombstoneKey())
2894       return LHS == RHS;
2895     return LHS->isIdenticalTo(RHS);
2896   }
2897 };
2898 
2899 } // end anonymous namespace
2900 
2901 /// Perform CSE of induction variable instructions.
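/// For example, given two identical instructions in \p BB such as
///   %a = extractelement <4 x i32> %v, i32 0
///   %b = extractelement <4 x i32> %v, i32 0
/// the second is redundant: uses of %b are rewritten to %a and %b is erased.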
2902 static void cse(BasicBlock *BB) {
2903   // Perform simple cse.
2904   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
2905   for (Instruction &In : llvm::make_early_inc_range(*BB)) {
2906     if (!CSEDenseMapInfo::canHandle(&In))
2907       continue;
2908 
2909     // Check if we can replace this instruction with any of the
2910     // visited instructions.
2911     if (Instruction *V = CSEMap.lookup(&In)) {
2912       In.replaceAllUsesWith(V);
2913       In.eraseFromParent();
2914       continue;
2915     }
2916 
2917     CSEMap[&In] = &In;
2918   }
2919 }
2920 
2921 InstructionCost
2922 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
2923                                               ElementCount VF) const {
2924   // We only need to calculate a cost if the VF is scalar; for actual vectors
2925   // we should already have a pre-calculated cost at each VF.
2926   if (!VF.isScalar())
2927     return CallWideningDecisions.at(std::make_pair(CI, VF)).Cost;
2928 
2929   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
2930   Type *RetTy = CI->getType();
2931   if (RecurrenceDescriptor::isFMulAddIntrinsic(CI))
2932     if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind))
2933       return *RedCost;
2934 
2935   SmallVector<Type *, 4> Tys;
2936   for (auto &ArgOp : CI->args())
2937     Tys.push_back(ArgOp->getType());
2938 
2939   InstructionCost ScalarCallCost =
2940       TTI.getCallInstrCost(CI->getCalledFunction(), RetTy, Tys, CostKind);
2941 
2942   // If this is an intrinsic we may have a lower cost for it.
2943   if (getVectorIntrinsicIDForCall(CI, TLI)) {
2944     InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
2945     return std::min(ScalarCallCost, IntrinsicCost);
2946   }
2947   return ScalarCallCost;
2948 }
2949 
2950 static Type *maybeVectorizeType(Type *Elt, ElementCount VF) {
2951   if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
2952     return Elt;
2953   return VectorType::get(Elt, VF);
2954 }
2955 
2956 InstructionCost
2957 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
2958                                                    ElementCount VF) const {
2959   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
2960   assert(ID && "Expected intrinsic call!");
2961   Type *RetTy = maybeVectorizeType(CI->getType(), VF);
2962   FastMathFlags FMF;
2963   if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
2964     FMF = FPMO->getFastMathFlags();
2965 
2966   SmallVector<const Value *> Arguments(CI->args());
2967   FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
2968   SmallVector<Type *> ParamTys;
2969   std::transform(FTy->param_begin(), FTy->param_end(),
2970                  std::back_inserter(ParamTys),
2971                  [&](Type *Ty) { return maybeVectorizeType(Ty, VF); });
2972 
2973   IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
2974                                     dyn_cast<IntrinsicInst>(CI));
2975   return TTI.getIntrinsicInstrCost(CostAttrs,
2976                                    TargetTransformInfo::TCK_RecipThroughput);
2977 }
2978 
2979 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
2980   // Fix widened non-induction PHIs by setting up the PHI operands.
2981   if (EnableVPlanNativePath)
2982     fixNonInductionPHIs(State);
2983 
2984   // Forget the original basic block.
2985   PSE.getSE()->forgetLoop(OrigLoop);
2986   PSE.getSE()->forgetBlockAndLoopDispositions();
2987 
2988   // After vectorization, the exit blocks of the original loop will have
2989   // additional predecessors. Invalidate SCEVs for the exit phis in case SE
2990   // looked through single-entry phis.
2991   SmallVector<BasicBlock *> ExitBlocks;
2992   OrigLoop->getExitBlocks(ExitBlocks);
2993   for (BasicBlock *Exit : ExitBlocks)
2994     for (PHINode &PN : Exit->phis())
2995       PSE.getSE()->forgetLcssaPhiWithNewPredecessor(OrigLoop, &PN);
2996 
2997   if (Cost->requiresScalarEpilogue(VF.isVector())) {
2998     // No edge from the middle block to the unique exit block has been inserted
2999     // and there is nothing to fix from vector loop; phis should have incoming
3000     // from scalar loop only.
3001   } else {
3002     // TODO: Check in VPlan to see if IV users need fixing instead of checking
3003     // the cost model.
3004 
3005     // If we inserted an edge from the middle block to the unique exit block,
3006     // update uses outside the loop (phis) to account for the newly inserted
3007     // edge.
3008 
3009     // Fix-up external users of the induction variables.
3010     for (const auto &Entry : Legal->getInductionVars())
3011       fixupIVUsers(Entry.first, Entry.second,
3012                    getOrCreateVectorTripCount(nullptr), LoopMiddleBlock, State);
3013   }
3014 
3015   // Don't apply optimizations below when no vector region remains, as they all
3016   // require a vector loop at the moment.
3017   if (!State.Plan->getVectorLoopRegion())
3018     return;
3019 
3020   for (Instruction *PI : PredicatedInstructions)
3021     sinkScalarOperands(&*PI);
3022 
3023   VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion();
3024   VPBasicBlock *HeaderVPBB = VectorRegion->getEntryBasicBlock();
3025   BasicBlock *HeaderBB = State.CFG.VPBB2IRBB[HeaderVPBB];
3026 
3027   // Remove redundant induction instructions.
3028   cse(HeaderBB);
3029 
3030   // Set/update profile weights for the vector and remainder loops as original
3031   // loop iterations are now distributed among them. Note that original loop
3032   // becomes the scalar remainder loop after vectorization.
3033   //
3034   // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
3035   // end up getting a slightly roughened result, but that should be OK since
3036   // profile is not inherently precise anyway. Note also possible bypass of
3037   // vector code caused by legality checks is ignored, assigning all the weight
3038   // to the vector loop, optimistically.
3039   //
3040   // For scalable vectorization we can't know at compile time how many
3041   // iterations of the loop are handled in one vector iteration, so instead
3042   // assume a pessimistic vscale of '1'.
3043   Loop *VectorLoop = LI->getLoopFor(HeaderBB);
3044   setProfileInfoAfterUnrolling(OrigLoop, VectorLoop, OrigLoop,
3045                                VF.getKnownMinValue() * UF);
3046 }
3047 
3048 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
3049   // The basic block and loop containing the predicated instruction.
3050   auto *PredBB = PredInst->getParent();
3051   auto *VectorLoop = LI->getLoopFor(PredBB);
3052 
3053   // Initialize a worklist with the operands of the predicated instruction.
3054   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3055 
3056   // Holds instructions that we need to analyze again. An instruction may be
3057   // reanalyzed if we don't yet know if we can sink it or not.
3058   SmallVector<Instruction *, 8> InstsToReanalyze;
3059 
3060   // Returns true if a given use occurs in the predicated block. Phi nodes use
3061   // their operands in their corresponding predecessor blocks.
3062   auto IsBlockOfUsePredicated = [&](Use &U) -> bool {
3063     auto *I = cast<Instruction>(U.getUser());
3064     BasicBlock *BB = I->getParent();
3065     if (auto *Phi = dyn_cast<PHINode>(I))
3066       BB = Phi->getIncomingBlock(
3067           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3068     return BB == PredBB;
3069   };
3070 
3071   // Iteratively sink the scalarized operands of the predicated instruction
3072   // into the block we created for it. When an instruction is sunk, its
3073   // operands are then added to the worklist. The algorithm ends after one pass
3074   // through the worklist doesn't sink a single instruction.
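  // For example, if a scalar %add used only inside the predicated block feeds
  // the predicated instruction, it is moved into that block and its own
  // operands are then added to the worklist; operands that cannot be sunk yet
  // are revisited on the next pass.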
3075   bool Changed;
3076   do {
3077     // Add the instructions that need to be reanalyzed to the worklist, and
3078     // reset the changed indicator.
3079     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
3080     InstsToReanalyze.clear();
3081     Changed = false;
3082 
3083     while (!Worklist.empty()) {
3084       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
3085 
3086       // We can't sink an instruction if it is a phi node, is not in the loop,
3087       // may have side effects or may read from memory.
3088       // TODO: Could do more granular checking to allow sinking
3089       // a load past non-store instructions.
3090       if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
3091           I->mayHaveSideEffects() || I->mayReadFromMemory())
3092         continue;
3093 
3094       // If the instruction is already in PredBB, check if we can sink its
3095       // operands. In that case, VPlan's sinkScalarOperands() succeeded in
3096       // sinking the scalar instruction I, hence it appears in PredBB; but it
3097       // may have failed to sink I's operands (recursively), which we try
3098       // (again) here.
3099       if (I->getParent() == PredBB) {
3100         Worklist.insert(I->op_begin(), I->op_end());
3101         continue;
3102       }
3103 
3104       // It's legal to sink the instruction if all its uses occur in the
3105       // predicated block. Otherwise, there's nothing to do yet, and we may
3106       // need to reanalyze the instruction.
3107       if (!llvm::all_of(I->uses(), IsBlockOfUsePredicated)) {
3108         InstsToReanalyze.push_back(I);
3109         continue;
3110       }
3111 
3112       // Move the instruction to the beginning of the predicated block, and add
3113   // its operands to the worklist.
3114       I->moveBefore(&*PredBB->getFirstInsertionPt());
3115       Worklist.insert(I->op_begin(), I->op_end());
3116 
3117       // The sinking may have enabled other instructions to be sunk, so we will
3118       // need to iterate.
3119       Changed = true;
3120     }
3121   } while (Changed);
3122 }
3123 
3124 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) {
3125   auto Iter = vp_depth_first_deep(Plan.getEntry());
3126   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
3127     for (VPRecipeBase &P : VPBB->phis()) {
3128       VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P);
3129       if (!VPPhi)
3130         continue;
3131       PHINode *NewPhi = cast<PHINode>(State.get(VPPhi));
3132       // Make sure the builder has a valid insert point.
3133       Builder.SetInsertPoint(NewPhi);
3134       for (unsigned Idx = 0; Idx < VPPhi->getNumOperands(); ++Idx) {
3135         VPValue *Inc = VPPhi->getIncomingValue(Idx);
3136         VPBasicBlock *VPBB = VPPhi->getIncomingBlock(Idx);
3137         NewPhi->addIncoming(State.get(Inc), State.CFG.VPBB2IRBB[VPBB]);
3138       }
3139     }
3140   }
3141 }
3142 
3143 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
3144   // We should not collect Scalars more than once per VF. Right now, this
3145   // function is called from collectUniformsAndScalars(), which already does
3146   // this check. Collecting Scalars for VF=1 does not make any sense.
3147   assert(VF.isVector() && !Scalars.contains(VF) &&
3148          "This function should not be visited twice for the same VF");
3149 
3150   // This avoids any chances of creating a REPLICATE recipe during planning
3151   // since that would result in generation of scalarized code during execution,
3152   // which is not supported for scalable vectors.
3153   if (VF.isScalable()) {
3154     Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end());
3155     return;
3156   }
3157 
3158   SmallSetVector<Instruction *, 8> Worklist;
3159 
3160   // These sets are used to seed the analysis with pointers used by memory
3161   // accesses that will remain scalar.
3162   SmallSetVector<Instruction *, 8> ScalarPtrs;
3163   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
3164   auto *Latch = TheLoop->getLoopLatch();
3165 
3166   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
3167   // The pointer operands of loads and stores will be scalar as long as the
3168   // memory access is not a gather or scatter operation. The value operand of a
3169   // store will remain scalar if the store is scalarized.
3170   auto IsScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
3171     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
3172     assert(WideningDecision != CM_Unknown &&
3173            "Widening decision should be ready at this moment");
3174     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
3175       if (Ptr == Store->getValueOperand())
3176         return WideningDecision == CM_Scalarize;
3177     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
3178            "Ptr is neither a value or pointer operand");
3179     return WideningDecision != CM_GatherScatter;
3180   };
3181 
3182   // A helper that returns true if the given value is a getelementptr
3183   // instruction contained in the loop.
3184   auto IsLoopVaryingGEP = [&](Value *V) {
3185     return isa<GetElementPtrInst>(V) && !TheLoop->isLoopInvariant(V);
3186   };
3187 
3188   // A helper that evaluates a memory access's use of a pointer. If the use will
3189   // be a scalar use and the pointer is only used by memory accesses, we place
3190   // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
3191   // PossibleNonScalarPtrs.
3192   auto EvaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
3193     // We only care about bitcast and getelementptr instructions contained in
3194     // the loop.
3195     if (!IsLoopVaryingGEP(Ptr))
3196       return;
3197 
3198     // If the pointer has already been identified as scalar (e.g., if it was
3199     // also identified as uniform), there's nothing to do.
3200     auto *I = cast<Instruction>(Ptr);
3201     if (Worklist.count(I))
3202       return;
3203 
3204     // If the use of the pointer will be a scalar use, and all users of the
3205     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
3206     // place the pointer in PossibleNonScalarPtrs.
3207     if (IsScalarUse(MemAccess, Ptr) &&
3208         all_of(I->users(), IsaPred<LoadInst, StoreInst>))
3209       ScalarPtrs.insert(I);
3210     else
3211       PossibleNonScalarPtrs.insert(I);
3212   };
3213 
3214   // We seed the scalars analysis with three classes of instructions: (1)
3215   // instructions marked uniform-after-vectorization, (2) bitcast,
3216   // getelementptr and (pointer) phi instructions used by memory accesses
3217   // requiring a scalar use, and (3) instructions forced to be scalar.
3218   //
3219   // (1) Add to the worklist all instructions that have been identified as
3220   // uniform-after-vectorization.
3221   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
3222 
3223   // (2) Add to the worklist all bitcast and getelementptr instructions used by
3224   // memory accesses requiring a scalar use. The pointer operands of loads and
3225   // stores will be scalar unless the operation is a gather or scatter.
3226   // The value operand of a store will remain scalar if the store is scalarized.
3227   for (auto *BB : TheLoop->blocks())
3228     for (auto &I : *BB) {
3229       if (auto *Load = dyn_cast<LoadInst>(&I)) {
3230         EvaluatePtrUse(Load, Load->getPointerOperand());
3231       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
3232         EvaluatePtrUse(Store, Store->getPointerOperand());
3233         EvaluatePtrUse(Store, Store->getValueOperand());
3234       }
3235     }
3236   for (auto *I : ScalarPtrs)
3237     if (!PossibleNonScalarPtrs.count(I)) {
3238       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
3239       Worklist.insert(I);
3240     }
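  // For example, a loop-varying getelementptr that feeds only widened
  // (non-gather) loads and stores ends up in ScalarPtrs and is added to the
  // worklist above, since a single scalar pointer suffices for the wide access.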
3241 
3242   // Insert the forced scalars.
3243   // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector
3244   // induction variable when the PHI user is scalarized.
3245   auto ForcedScalar = ForcedScalars.find(VF);
3246   if (ForcedScalar != ForcedScalars.end())
3247     for (auto *I : ForcedScalar->second) {
3248       LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n");
3249       Worklist.insert(I);
3250     }
3251 
3252   // Expand the worklist by looking through any bitcasts and getelementptr
3253   // instructions we've already identified as scalar. This is similar to the
3254   // expansion step in collectLoopUniforms(); however, here we're only
3255   // expanding to include additional bitcasts and getelementptr instructions.
3256   unsigned Idx = 0;
3257   while (Idx != Worklist.size()) {
3258     Instruction *Dst = Worklist[Idx++];
3259     if (!IsLoopVaryingGEP(Dst->getOperand(0)))
3260       continue;
3261     auto *Src = cast<Instruction>(Dst->getOperand(0));
3262     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
3263           auto *J = cast<Instruction>(U);
3264           return !TheLoop->contains(J) || Worklist.count(J) ||
3265                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
3266                   IsScalarUse(J, Src));
3267         })) {
3268       Worklist.insert(Src);
3269       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
3270     }
3271   }
3272 
3273   // An induction variable will remain scalar if all users of the induction
3274   // variable and induction variable update remain scalar.
3275   for (const auto &Induction : Legal->getInductionVars()) {
3276     auto *Ind = Induction.first;
3277     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
3278 
3279     // If tail-folding is applied, the primary induction variable will be used
3280     // to feed a vector compare.
3281     if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
3282       continue;
3283 
3284     // Returns true if \p Indvar is a pointer induction that is used directly by
3285     // load/store instruction \p I.
3286     auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
3287                                               Instruction *I) {
3288       return Induction.second.getKind() ==
3289                  InductionDescriptor::IK_PtrInduction &&
3290              (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
3291              Indvar == getLoadStorePointerOperand(I) && IsScalarUse(I, Indvar);
3292     };
3293 
3294     // Determine if all users of the induction variable are scalar after
3295     // vectorization.
3296     bool ScalarInd = all_of(Ind->users(), [&](User *U) -> bool {
3297       auto *I = cast<Instruction>(U);
3298       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
3299              IsDirectLoadStoreFromPtrIndvar(Ind, I);
3300     });
3301     if (!ScalarInd)
3302       continue;
3303 
3304     // If the induction variable update is a fixed-order recurrence, neither the
3305     // induction variable nor its update should be marked scalar after
3306     // vectorization.
3307     auto *IndUpdatePhi = dyn_cast<PHINode>(IndUpdate);
3308     if (IndUpdatePhi && Legal->isFixedOrderRecurrence(IndUpdatePhi))
3309       continue;
3310 
3311     // Determine if all users of the induction variable update instruction are
3312     // scalar after vectorization.
3313     bool ScalarIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool {
3314       auto *I = cast<Instruction>(U);
3315       return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
3316              IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
3317     });
3318     if (!ScalarIndUpdate)
3319       continue;
3320 
3321     // The induction variable and its update instruction will remain scalar.
3322     Worklist.insert(Ind);
3323     Worklist.insert(IndUpdate);
3324     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
3325     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
3326                       << "\n");
3327   }
3328 
3329   Scalars[VF].insert(Worklist.begin(), Worklist.end());
3330 }
3331 
3332 bool LoopVectorizationCostModel::isScalarWithPredication(
3333     Instruction *I, ElementCount VF) const {
3334   if (!isPredicatedInst(I))
3335     return false;
3336 
3337   // Do we have a non-scalar lowering for this predicated
3338   // instruction? No - it is scalar with predication.
3339   switch (I->getOpcode()) {
3340   default:
3341     return true;
3342   case Instruction::Call:
3343     if (VF.isScalar())
3344       return true;
3345     return CallWideningDecisions.at(std::make_pair(cast<CallInst>(I), VF))
3346                .Kind == CM_Scalarize;
3347   case Instruction::Load:
3348   case Instruction::Store: {
3349     auto *Ptr = getLoadStorePointerOperand(I);
3350     auto *Ty = getLoadStoreType(I);
3351     Type *VTy = Ty;
3352     if (VF.isVector())
3353       VTy = VectorType::get(Ty, VF);
3354     const Align Alignment = getLoadStoreAlignment(I);
3355     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
3356                                 TTI.isLegalMaskedGather(VTy, Alignment))
3357                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
3358                                 TTI.isLegalMaskedScatter(VTy, Alignment));
3359   }
3360   case Instruction::UDiv:
3361   case Instruction::SDiv:
3362   case Instruction::SRem:
3363   case Instruction::URem: {
3364     // We have the option to use the safe-divisor idiom to avoid predication.
3365     // The cost based decision here will always select safe-divisor for
3366     // scalable vectors as scalarization isn't legal.
3367     const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
3368     return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost);
3369   }
3370   }
3371 }
3372 
3373 // TODO: Fold into LoopVectorizationLegality::isMaskRequired.
3374 bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
3375   // If predication is not needed, avoid it.
3376   // TODO: We can use the loop-preheader as context point here and get
3377   // context sensitive reasoning for isSafeToSpeculativelyExecute.
3378   if (!blockNeedsPredicationForAnyReason(I->getParent()) ||
3379       isSafeToSpeculativelyExecute(I) ||
3380       (isa<LoadInst, StoreInst, CallInst>(I) && !Legal->isMaskRequired(I)) ||
3381       isa<BranchInst, SwitchInst, PHINode, AllocaInst>(I))
3382     return false;
3383 
3384   // If the instruction was executed conditionally in the original scalar loop,
3385   // predication is needed with a mask whose lanes are all possibly inactive.
3386   if (Legal->blockNeedsPredication(I->getParent()))
3387     return true;
3388 
3389   // All that remain are instructions with side-effects originally executed in
3390   // the loop unconditionally, but now execute under a tail-fold mask (only)
3391   // having at least one active lane (the first). If the side-effects of the
3392   // instruction are invariant, executing it w/o (the tail-folding) mask is safe
3393   // - it will cause the same side-effects as when masked.
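  // For example, a store of a loop-invariant value to a loop-invariant address
  // that executed unconditionally in the scalar loop writes the same value
  // regardless of the mask, so it does not need predication (see the Store
  // case below).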
3394   switch (I->getOpcode()) {
3395   default:
3396     llvm_unreachable(
3397         "instruction should have been considered by earlier checks");
3398   case Instruction::Call:
3399     // Side-effects of a Call are assumed to be non-invariant, needing a
3400     // (fold-tail) mask.
3401     assert(Legal->isMaskRequired(I) &&
3402            "should have returned earlier for calls not needing a mask");
3403     return true;
3404   case Instruction::Load:
3405     // If the address is loop invariant no predication is needed.
3406     return !Legal->isInvariant(getLoadStorePointerOperand(I));
3407   case Instruction::Store: {
3408     // For stores, we need to prove both speculation safety (which follows from
3409     // the same argument as loads), but also must prove the value being stored
3410     // is correct.  The easiest form of the later is to require that all values
3411     // stored are the same.
3412     return !(Legal->isInvariant(getLoadStorePointerOperand(I)) &&
3413              TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()));
3414   }
3415   case Instruction::UDiv:
3416   case Instruction::SDiv:
3417   case Instruction::SRem:
3418   case Instruction::URem:
3419     // If the divisor is loop-invariant no predication is needed.
3420     return !TheLoop->isLoopInvariant(I->getOperand(1));
3421   }
3422 }
3423 
3424 std::pair<InstructionCost, InstructionCost>
3425 LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
3426                                                     ElementCount VF) const {
3427   assert(I->getOpcode() == Instruction::UDiv ||
3428          I->getOpcode() == Instruction::SDiv ||
3429          I->getOpcode() == Instruction::SRem ||
3430          I->getOpcode() == Instruction::URem);
3431   assert(!isSafeToSpeculativelyExecute(I));
3432 
3433   const TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
3434 
3435   // Scalarization isn't legal for scalable vector types
3436   InstructionCost ScalarizationCost = InstructionCost::getInvalid();
3437   if (!VF.isScalable()) {
3438     // Get the scalarization cost and scale this amount by the probability of
3439     // executing the predicated block. If the instruction is not predicated,
3440     // we fall through to the next case.
3441     ScalarizationCost = 0;
3442 
3443     // These instructions have a non-void type, so account for the phi nodes
3444     // that we will create. This cost is likely to be zero. The phi node
3445     // cost, if any, should be scaled by the block probability because it
3446     // models a copy at the end of each predicated block.
3447     ScalarizationCost += VF.getKnownMinValue() *
3448       TTI.getCFInstrCost(Instruction::PHI, CostKind);
3449 
3450     // The cost of the non-predicated instruction.
3451     ScalarizationCost += VF.getKnownMinValue() *
3452       TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind);
3453 
3454     // The cost of insertelement and extractelement instructions needed for
3455     // scalarization.
3456     ScalarizationCost += getScalarizationOverhead(I, VF, CostKind);
3457 
3458     // Scale the cost by the probability of executing the predicated blocks.
3459     // This assumes the predicated block for each vector lane is equally
3460     // likely.
3461     ScalarizationCost = ScalarizationCost / getReciprocalPredBlockProb();
3462   }
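  // For example, assuming fixed VF = 4, a PHI cost of 0, a scalar divide cost
  // of 20, a scalarization overhead of 8, and a reciprocal predicated-block
  // probability of 2 (assumed values), the scalarization cost computed above
  // is (4*0 + 4*20 + 8) / 2 = 44.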
3463   InstructionCost SafeDivisorCost = 0;
3464 
3465   auto *VecTy = toVectorTy(I->getType(), VF);
3466 
3467   // The cost of the select guard to ensure all lanes are well defined
3468   // after we speculate above any internal control flow.
3469   SafeDivisorCost +=
3470       TTI.getCmpSelInstrCost(Instruction::Select, VecTy,
3471                              toVectorTy(Type::getInt1Ty(I->getContext()), VF),
3472                              CmpInst::BAD_ICMP_PREDICATE, CostKind);
3473 
3474   // Certain instructions can be cheaper to vectorize if they have a constant
3475   // second vector operand. One example of this are shifts on x86.
3476   Value *Op2 = I->getOperand(1);
3477   auto Op2Info = TTI.getOperandInfo(Op2);
3478   if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
3479       Legal->isInvariant(Op2))
3480     Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
3481 
3482   SmallVector<const Value *, 4> Operands(I->operand_values());
3483   SafeDivisorCost += TTI.getArithmeticInstrCost(
3484     I->getOpcode(), VecTy, CostKind,
3485     {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
3486     Op2Info, Operands, I);
3487   return {ScalarizationCost, SafeDivisorCost};
3488 }
3489 
3490 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
3491     Instruction *I, ElementCount VF) const {
3492   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
3493   assert(getWideningDecision(I, VF) == CM_Unknown &&
3494          "Decision should not be set yet.");
3495   auto *Group = getInterleavedAccessGroup(I);
3496   assert(Group && "Must have a group.");
3497   unsigned InterleaveFactor = Group->getFactor();
3498 
3499   // If the instruction's allocated size doesn't equal its type size, it
3500   // requires padding and will be scalarized.
3501   auto &DL = I->getDataLayout();
3502   auto *ScalarTy = getLoadStoreType(I);
3503   if (hasIrregularType(ScalarTy, DL))
3504     return false;
3505 
3506   // We currently only know how to emit interleave/deinterleave with
3507   // Factor=2 for scalable vectors. This is purely an implementation
3508   // limit.
3509   if (VF.isScalable() && InterleaveFactor != 2)
3510     return false;
3511 
3512   // If the group involves a non-integral pointer, we may not be able to
3513   // losslessly cast all values to a common type.
3514   bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy);
3515   for (unsigned Idx = 0; Idx < InterleaveFactor; Idx++) {
3516     Instruction *Member = Group->getMember(Idx);
3517     if (!Member)
3518       continue;
3519     auto *MemberTy = getLoadStoreType(Member);
3520     bool MemberNI = DL.isNonIntegralPointerType(MemberTy);
3521     // Don't coerce non-integral pointers to integers or vice versa.
3522     if (MemberNI != ScalarNI)
3523       // TODO: Consider adding special nullptr value case here
3524       return false;
3525     if (MemberNI && ScalarNI &&
3526         ScalarTy->getPointerAddressSpace() !=
3527             MemberTy->getPointerAddressSpace())
3528       return false;
3529   }
3530 
3531   // Check if masking is required.
3532   // A Group may need masking for one of two reasons: it resides in a block that
3533   // needs predication, or it was decided to use masking to deal with gaps
3534   // (either a gap at the end of a load-access that may result in a speculative
3535   // load, or any gaps in a store-access).
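  // For example, a store group with factor 3 but only 2 members has a gap, so
  // StoreAccessWithGapsRequiresMasking below is true and the group can only be
  // widened if masked interleaved accesses are enabled and legal for the
  // target.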
3536   bool PredicatedAccessRequiresMasking =
3537       blockNeedsPredicationForAnyReason(I->getParent()) &&
3538       Legal->isMaskRequired(I);
3539   bool LoadAccessWithGapsRequiresEpilogMasking =
3540       isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
3541       !isScalarEpilogueAllowed();
3542   bool StoreAccessWithGapsRequiresMasking =
3543       isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
3544   if (!PredicatedAccessRequiresMasking &&
3545       !LoadAccessWithGapsRequiresEpilogMasking &&
3546       !StoreAccessWithGapsRequiresMasking)
3547     return true;
3548 
3549   // If masked interleaving is required, we expect that the user/target had
3550   // enabled it, because otherwise it either wouldn't have been created or
3551   // it should have been invalidated by the CostModel.
3552   assert(useMaskedInterleavedAccesses(TTI) &&
3553          "Masked interleave-groups for predicated accesses are not enabled.");
3554 
3555   if (Group->isReverse())
3556     return false;
3557 
3558   auto *Ty = getLoadStoreType(I);
3559   const Align Alignment = getLoadStoreAlignment(I);
3560   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
3561                           : TTI.isLegalMaskedStore(Ty, Alignment);
3562 }
3563 
3564 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
3565     Instruction *I, ElementCount VF) {
3566   // Get and ensure we have a valid memory instruction.
3567   assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
3568 
3569   auto *Ptr = getLoadStorePointerOperand(I);
3570   auto *ScalarTy = getLoadStoreType(I);
3571 
3572   // In order to be widened, the pointer should be consecutive, first of all.
3573   if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
3574     return false;
3575 
3576   // If the instruction is a store located in a predicated block, it will be
3577   // scalarized.
3578   if (isScalarWithPredication(I, VF))
3579     return false;
3580 
3581   // If the instruction's allocated size doesn't equal its type size, it
3582   // requires padding and will be scalarized.
3583   auto &DL = I->getDataLayout();
3584   if (hasIrregularType(ScalarTy, DL))
3585     return false;
3586 
3587   return true;
3588 }
3589 
3590 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
3591   // We should not collect Uniforms more than once per VF. Right now,
3592   // this function is called from collectUniformsAndScalars(), which
3593   // already does this check. Collecting Uniforms for VF=1 does not make any
3594   // sense.
3595 
3596   assert(VF.isVector() && !Uniforms.contains(VF) &&
3597          "This function should not be visited twice for the same VF");
3598 
3599   // Initialize the entry for this VF. Even if no uniform values are found,
3600   // Uniforms will contain VF afterwards, so this VF is not analyzed again.
3601   Uniforms[VF].clear();
3602 
3603   // Now we know that the loop is vectorizable!
3604   // Collect instructions inside the loop that will remain uniform after
3605   // vectorization.
3606 
3607   // Global values, params and instructions outside of current loop are out of
3608   // scope.
3609   auto IsOutOfScope = [&](Value *V) -> bool {
3610     Instruction *I = dyn_cast<Instruction>(V);
3611     return (!I || !TheLoop->contains(I));
3612   };
3613 
3614   // Worklist containing uniform instructions demanding lane 0.
3615   SetVector<Instruction *> Worklist;
3616 
3617   // Add uniform instructions demanding lane 0 to the worklist. Instructions
3618   // that require predication must not be considered uniform after
3619   // vectorization, because that would create an erroneous replicating region
3620   // where only a single instance out of VF should be formed.
3621   auto AddToWorklistIfAllowed = [&](Instruction *I) -> void {
3622     if (IsOutOfScope(I)) {
3623       LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
3624                         << *I << "\n");
3625       return;
3626     }
3627     if (isPredicatedInst(I)) {
3628       LLVM_DEBUG(
3629           dbgs() << "LV: Found not uniform due to requiring predication: " << *I
3630                  << "\n");
3631       return;
3632     }
3633     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
3634     Worklist.insert(I);
3635   };
3636 
3637   // Start with the conditional branches exiting the loop. If the branch
3638   // condition is an instruction contained in the loop that is only used by the
3639   // branch, it is uniform. Note conditions from uncountable early exits are not
3640   // uniform.
3641   SmallVector<BasicBlock *> Exiting;
3642   TheLoop->getExitingBlocks(Exiting);
3643   for (BasicBlock *E : Exiting) {
3644     if (Legal->hasUncountableEarlyExit() && TheLoop->getLoopLatch() != E)
3645       continue;
3646     auto *Cmp = dyn_cast<Instruction>(E->getTerminator()->getOperand(0));
3647     if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
3648       AddToWorklistIfAllowed(Cmp);
3649   }
3650 
3651   auto PrevVF = VF.divideCoefficientBy(2);
3652   // Return true if all lanes perform the same memory operation, and we can
3653   // thus choose to execute only one.
3654   auto IsUniformMemOpUse = [&](Instruction *I) {
3655     // If the value was already known to not be uniform for the previous
3656     // (smaller VF), it cannot be uniform for the larger VF.
3657     if (PrevVF.isVector()) {
3658       auto Iter = Uniforms.find(PrevVF);
3659       if (Iter != Uniforms.end() && !Iter->second.contains(I))
3660         return false;
3661     }
3662     if (!Legal->isUniformMemOp(*I, VF))
3663       return false;
3664     if (isa<LoadInst>(I))
3665       // Loading the same address always produces the same result - at least
3666       // under the aliasing and ordering assumptions already checked.
3667       return true;
3668     // Storing the same value on every iteration.
3669     return TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand());
3670   };
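       // Illustrative example (hypothetical, not from the original source),
       // with a loop-invariant pointer Q:
       //   for (i = 0; i < n; i++) x += *Q;    // uniform load
       //   for (i = 0; i < n; i++) *Q = 42;    // uniform store (invariant value)
       //   for (i = 0; i < n; i++) *Q = A[i];  // not uniform: stored value varies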
3671 
3672   auto IsUniformDecision = [&](Instruction *I, ElementCount VF) {
3673     InstWidening WideningDecision = getWideningDecision(I, VF);
3674     assert(WideningDecision != CM_Unknown &&
3675            "Widening decision should be ready at this moment");
3676 
3677     if (IsUniformMemOpUse(I))
3678       return true;
3679 
3680     return (WideningDecision == CM_Widen ||
3681             WideningDecision == CM_Widen_Reverse ||
3682             WideningDecision == CM_Interleave);
3683   };
3684 
3685   // Returns true if Ptr is the pointer operand of a memory access instruction
3686   // I, I is known to not require scalarization, and the pointer is not also
3687   // stored.
3688   auto IsVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
3689     if (isa<StoreInst>(I) && I->getOperand(0) == Ptr)
3690       return false;
3691     return getLoadStorePointerOperand(I) == Ptr &&
3692            (IsUniformDecision(I, VF) || Legal->isInvariant(Ptr));
3693   };
3694 
3695   // Holds a list of values which are known to have at least one uniform use.
3696   // Note that there may be other uses which aren't uniform.  A "uniform use"
3697   // here is something which only demands lane 0 of the unrolled iterations;
3698   // it does not imply that all lanes produce the same value (i.e. this is not
3699   // the usual meaning of uniform).
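       // Illustrative example (not from the original source): in
       //   for (i = 0; i < n; i++) Sum += A[i];
       // the address feeding the consecutive load of A[i] has a uniform use:
       // the wide load is generated from lane 0's pointer, even though the
       // loaded values themselves differ per lane.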
3700   SetVector<Value *> HasUniformUse;
3701 
3702   // Scan the loop for instructions which are either a) known to have only
3703   // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
3704   for (auto *BB : TheLoop->blocks())
3705     for (auto &I : *BB) {
3706       if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
3707         switch (II->getIntrinsicID()) {
3708         case Intrinsic::sideeffect:
3709         case Intrinsic::experimental_noalias_scope_decl:
3710         case Intrinsic::assume:
3711         case Intrinsic::lifetime_start:
3712         case Intrinsic::lifetime_end:
3713           if (TheLoop->hasLoopInvariantOperands(&I))
3714             AddToWorklistIfAllowed(&I);
3715           break;
3716         default:
3717           break;
3718         }
3719       }
3720 
3721       // ExtractValue instructions must be uniform, because the operands are
3722       // known to be loop-invariant.
3723       if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
3724         assert(IsOutOfScope(EVI->getAggregateOperand()) &&
3725                "Expected aggregate value to be loop invariant");
3726         AddToWorklistIfAllowed(EVI);
3727         continue;
3728       }
3729 
3730       // If there's no pointer operand, there's nothing to do.
3731       auto *Ptr = getLoadStorePointerOperand(&I);
3732       if (!Ptr)
3733         continue;
3734 
3735       if (IsUniformMemOpUse(&I))
3736         AddToWorklistIfAllowed(&I);
3737 
3738       if (IsVectorizedMemAccessUse(&I, Ptr))
3739         HasUniformUse.insert(Ptr);
3740     }
3741 
3742   // Add to the worklist any operands which have *only* uniform (i.e. lane-0
3743   // demanding) users.  Since loops are assumed to be in LCSSA form, this
3744   // disallows uses outside the loop as well.
3745   for (auto *V : HasUniformUse) {
3746     if (IsOutOfScope(V))
3747       continue;
3748     auto *I = cast<Instruction>(V);
3749     bool UsersAreMemAccesses = all_of(I->users(), [&](User *U) -> bool {
3750       auto *UI = cast<Instruction>(U);
3751       return TheLoop->contains(UI) && IsVectorizedMemAccessUse(UI, V);
3752     });
3753     if (UsersAreMemAccesses)
3754       AddToWorklistIfAllowed(I);
3755   }
3756 
3757   // Expand Worklist in topological order: whenever a new instruction
3758   // is added, its users should already be inside Worklist. This ensures
3759   // that a uniform instruction will only be used by uniform instructions.
3760   unsigned Idx = 0;
3761   while (Idx != Worklist.size()) {
3762     Instruction *I = Worklist[Idx++];
3763 
3764     for (auto *OV : I->operand_values()) {
3765       // Out-of-scope operands cannot be uniform instructions.
3766       if (IsOutOfScope(OV))
3767         continue;
3768       // First-order recurrence phis should typically be considered
3769       // non-uniform.
3770       auto *OP = dyn_cast<PHINode>(OV);
3771       if (OP && Legal->isFixedOrderRecurrence(OP))
3772         continue;
3773       // If all the users of the operand are uniform, then add the
3774       // operand into the uniform worklist.
3775       auto *OI = cast<Instruction>(OV);
3776       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
3777             auto *J = cast<Instruction>(U);
3778             return Worklist.count(J) || IsVectorizedMemAccessUse(J, OI);
3779           }))
3780         AddToWorklistIfAllowed(OI);
3781     }
3782   }
3783 
3784   // For an instruction to be added into Worklist above, all its users inside
3785   // the loop should also be in Worklist. However, this condition cannot be
3786   // true for phi nodes that form a cyclic dependence. We must process phi
3787   // nodes separately. An induction variable will remain uniform if all users
3788   // of the induction variable and induction variable update remain uniform.
3789   // The code below handles both pointer and non-pointer induction variables.
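       // Illustrative example (not from the original source): in
       //   for (i = 0; i < n; i++) A[i] = 0;
       // the only users of the induction and its update (the address
       // computation for the consecutive store, the increment itself, and the
       // latch compare) all remain uniform, so both are added to the worklist.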
3790   BasicBlock *Latch = TheLoop->getLoopLatch();
3791   for (const auto &Induction : Legal->getInductionVars()) {
3792     auto *Ind = Induction.first;
3793     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
3794 
3795     // Determine if all users of the induction variable are uniform after
3796     // vectorization.
3797     bool UniformInd = all_of(Ind->users(), [&](User *U) -> bool {
3798       auto *I = cast<Instruction>(U);
3799       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
3800              IsVectorizedMemAccessUse(I, Ind);
3801     });
3802     if (!UniformInd)
3803       continue;
3804 
3805     // Determine if all users of the induction variable update instruction are
3806     // uniform after vectorization.
3807     bool UniformIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool {
3808       auto *I = cast<Instruction>(U);
3809       return I == Ind || Worklist.count(I) ||
3810              IsVectorizedMemAccessUse(I, IndUpdate);
3811     });
3812     if (!UniformIndUpdate)
3813       continue;
3814 
3815     // The induction variable and its update instruction will remain uniform.
3816     AddToWorklistIfAllowed(Ind);
3817     AddToWorklistIfAllowed(IndUpdate);
3818   }
3819 
3820   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
3821 }
3822 
3823 bool LoopVectorizationCostModel::runtimeChecksRequired() {
3824   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
3825 
3826   if (Legal->getRuntimePointerChecking()->Need) {
3827     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
3828         "runtime pointer checks needed. Enable vectorization of this "
3829         "loop with '#pragma clang loop vectorize(enable)' when "
3830         "compiling with -Os/-Oz",
3831         "CantVersionLoopWithOptForSize", ORE, TheLoop);
3832     return true;
3833   }
3834 
3835   if (!PSE.getPredicate().isAlwaysTrue()) {
3836     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
3837         "runtime SCEV checks needed. Enable vectorization of this "
3838         "loop with '#pragma clang loop vectorize(enable)' when "
3839         "compiling with -Os/-Oz",
3840         "CantVersionLoopWithOptForSize", ORE, TheLoop);
3841     return true;
3842   }
3843 
3844   // FIXME: Avoid specializing for stride==1 instead of bailing out.
3845   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
3846     reportVectorizationFailure("Runtime stride check for small trip count",
3847         "runtime stride == 1 checks needed. Enable vectorization of "
3848         "this loop without such check by compiling with -Os/-Oz",
3849         "CantVersionLoopWithOptForSize", ORE, TheLoop);
3850     return true;
3851   }
3852 
3853   return false;
3854 }
3855 
3856 bool LoopVectorizationCostModel::isScalableVectorizationAllowed() {
3857   if (IsScalableVectorizationAllowed)
3858     return *IsScalableVectorizationAllowed;
3859 
3860   IsScalableVectorizationAllowed = false;
3861   if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
3862     return false;
3863 
3864   if (Hints->isScalableVectorizationDisabled()) {
3865     reportVectorizationInfo("Scalable vectorization is explicitly disabled",
3866                             "ScalableVectorizationDisabled", ORE, TheLoop);
3867     return false;
3868   }
3869 
3870   LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
3871 
3872   auto MaxScalableVF = ElementCount::getScalable(
3873       std::numeric_limits<ElementCount::ScalarTy>::max());
3874 
3875   // Test that the loop-vectorizer can legalize all operations for this MaxVF.
3876   // FIXME: While for scalable vectors this is currently sufficient, this should
3877   // be replaced by a more detailed mechanism that filters out specific VFs,
3878   // instead of invalidating vectorization for a whole set of VFs based on the
3879   // MaxVF.
3880 
3881   // Disable scalable vectorization if the loop contains unsupported reductions.
3882   if (!canVectorizeReductions(MaxScalableVF)) {
3883     reportVectorizationInfo(
3884         "Scalable vectorization not supported for the reduction "
3885         "operations found in this loop.",
3886         "ScalableVFUnfeasible", ORE, TheLoop);
3887     return false;
3888   }
3889 
3890   // Disable scalable vectorization if the loop contains any instructions
3891   // with element types not supported for scalable vectors.
3892   if (any_of(ElementTypesInLoop, [&](Type *Ty) {
3893         return !Ty->isVoidTy() &&
3894                !this->TTI.isElementTypeLegalForScalableVector(Ty);
3895       })) {
3896     reportVectorizationInfo("Scalable vectorization is not supported "
3897                             "for all element types found in this loop.",
3898                             "ScalableVFUnfeasible", ORE, TheLoop);
3899     return false;
3900   }
3901 
3902   if (!Legal->isSafeForAnyVectorWidth() && !getMaxVScale(*TheFunction, TTI)) {
3903     reportVectorizationInfo("The target does not provide maximum vscale value "
3904                             "for safe distance analysis.",
3905                             "ScalableVFUnfeasible", ORE, TheLoop);
3906     return false;
3907   }
3908 
3909   IsScalableVectorizationAllowed = true;
3910   return true;
3911 }
3912 
3913 ElementCount
3914 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
3915   if (!isScalableVectorizationAllowed())
3916     return ElementCount::getScalable(0);
3917 
3918   auto MaxScalableVF = ElementCount::getScalable(
3919       std::numeric_limits<ElementCount::ScalarTy>::max());
3920   if (Legal->isSafeForAnyVectorWidth())
3921     return MaxScalableVF;
3922 
3923   std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
3924   // Limit MaxScalableVF by the maximum safe dependence distance.
3925   MaxScalableVF = ElementCount::getScalable(MaxSafeElements / *MaxVScale);
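       // MaxVScale is expected to have a value here: isScalableVectorizationAllowed()
       // already rejected the case where the loop is not safe for any vector width
       // and the target provides no maximum vscale.
       // Illustrative example (hypothetical numbers): MaxSafeElements == 32 and a
       // maximum vscale of 16 give a largest safe scalable VF of vscale x 2.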
3926 
3927   if (!MaxScalableVF)
3928     reportVectorizationInfo(
3929         "Max legal vector width too small, scalable vectorization "
3930         "unfeasible.",
3931         "ScalableVFUnfeasible", ORE, TheLoop);
3932 
3933   return MaxScalableVF;
3934 }
3935 
3936 FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
3937     unsigned MaxTripCount, ElementCount UserVF, bool FoldTailByMasking) {
3938   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
3939   unsigned SmallestType, WidestType;
3940   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
3941 
3942   // Get the maximum safe dependence distance in bits computed by LAA.
3943   // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
3944   // the memory access that is most restrictive (involved in the smallest
3945   // dependence distance).
3946   unsigned MaxSafeElements =
3947       llvm::bit_floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
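       // Illustrative example (hypothetical numbers): with a maximum safe
       // vector width of 384 bits and WidestType == 32, 384 / 32 == 12 and
       // bit_floor(12) == 8, so MaxSafeElements == 8.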
3948 
3949   auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
3950   auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
3951   if (!Legal->isSafeForAnyVectorWidth())
3952     this->MaxSafeElements = MaxSafeElements;
3953 
3954   LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
3955                     << ".\n");
3956   LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
3957                     << ".\n");
3958 
3959   // First analyze the UserVF, fall back if the UserVF should be ignored.
3960   if (UserVF) {
3961     auto MaxSafeUserVF =
3962         UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
3963 
3964     if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
3965       // If `VF=vscale x N` is safe, then so is `VF=N`
3966       if (UserVF.isScalable())
3967         return FixedScalableVFPair(
3968             ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
3969 
3970       return UserVF;
3971     }
3972 
3973     assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
3974 
3975     // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
3976     // is better to ignore the hint and let the compiler choose a suitable VF.
3977     if (!UserVF.isScalable()) {
3978       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3979                         << " is unsafe, clamping to max safe VF="
3980                         << MaxSafeFixedVF << ".\n");
3981       ORE->emit([&]() {
3982         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
3983                                           TheLoop->getStartLoc(),
3984                                           TheLoop->getHeader())
3985                << "User-specified vectorization factor "
3986                << ore::NV("UserVectorizationFactor", UserVF)
3987                << " is unsafe, clamping to maximum safe vectorization factor "
3988                << ore::NV("VectorizationFactor", MaxSafeFixedVF);
3989       });
3990       return MaxSafeFixedVF;
3991     }
3992 
3993     if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
3994       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3995                         << " is ignored because scalable vectors are not "
3996                            "available.\n");
3997       ORE->emit([&]() {
3998         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
3999                                           TheLoop->getStartLoc(),
4000                                           TheLoop->getHeader())
4001                << "User-specified vectorization factor "
4002                << ore::NV("UserVectorizationFactor", UserVF)
4003                << " is ignored because the target does not support scalable "
4004                   "vectors. The compiler will pick a more suitable value.";
4005       });
4006     } else {
4007       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4008                         << " is unsafe. Ignoring scalable UserVF.\n");
4009       ORE->emit([&]() {
4010         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4011                                           TheLoop->getStartLoc(),
4012                                           TheLoop->getHeader())
4013                << "User-specified vectorization factor "
4014                << ore::NV("UserVectorizationFactor", UserVF)
4015                << " is unsafe. Ignoring the hint to let the compiler pick a "
4016                   "more suitable value.";
4017       });
4018     }
4019   }
4020 
4021   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
4022                     << " / " << WidestType << " bits.\n");
4023 
4024   FixedScalableVFPair Result(ElementCount::getFixed(1),
4025                              ElementCount::getScalable(0));
4026   if (auto MaxVF =
4027           getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
4028                                   MaxSafeFixedVF, FoldTailByMasking))
4029     Result.FixedVF = MaxVF;
4030 
4031   if (auto MaxVF =
4032           getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
4033                                   MaxSafeScalableVF, FoldTailByMasking))
4034     if (MaxVF.isScalable()) {
4035       Result.ScalableVF = MaxVF;
4036       LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
4037                         << "\n");
4038     }
4039 
4040   return Result;
4041 }
4042 
4043 FixedScalableVFPair
4044 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
4045   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
4046     // TODO: It may be useful to do this anyway, since the check is still
4047     // likely to be dynamically uniform if the target can skip it.
4048     reportVectorizationFailure(
4049         "Not inserting runtime ptr check for divergent target",
4050         "runtime pointer checks needed. Not enabled for divergent target",
4051         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
4052     return FixedScalableVFPair::getNone();
4053   }
4054 
4055   ScalarEvolution *SE = PSE.getSE();
4056   unsigned TC = SE->getSmallConstantTripCount(TheLoop);
4057   unsigned MaxTC = PSE.getSmallConstantMaxTripCount();
4058   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4059   if (TC != MaxTC)
4060     LLVM_DEBUG(dbgs() << "LV: Found maximum trip count: " << MaxTC << '\n');
4061   if (TC == 1) {
4062     reportVectorizationFailure("Single iteration (non) loop",
4063         "loop trip count is one, irrelevant for vectorization",
4064         "SingleIterationLoop", ORE, TheLoop);
4065     return FixedScalableVFPair::getNone();
4066   }
4067 
4068   // If BTC matches the widest induction type and is -1 then the trip count
4069   // computation will wrap to 0 and the vector trip count will be 0. Do not try
4070   // to vectorize.
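       // Illustrative example (not from the original source): a loop stepping an
       // i64 induction through all 2^64 values has a backedge-taken count equal
       // to -1 in i64, so BTC + 1 wraps to 0 and no vector trip count exists.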
4071   const SCEV *BTC = SE->getBackedgeTakenCount(TheLoop);
4072   if (!isa<SCEVCouldNotCompute>(BTC) &&
4073       BTC->getType()->getScalarSizeInBits() >=
4074           Legal->getWidestInductionType()->getScalarSizeInBits() &&
4075       SE->isKnownPredicate(CmpInst::ICMP_EQ, BTC,
4076                            SE->getMinusOne(BTC->getType()))) {
4077     reportVectorizationFailure(
4078         "Trip count computation wrapped",
4079         "backedge-taken count is -1, loop trip count wrapped to 0",
4080         "TripCountWrapped", ORE, TheLoop);
4081     return FixedScalableVFPair::getNone();
4082   }
4083 
4084   switch (ScalarEpilogueStatus) {
4085   case CM_ScalarEpilogueAllowed:
4086     return computeFeasibleMaxVF(MaxTC, UserVF, false);
4087   case CM_ScalarEpilogueNotAllowedUsePredicate:
4088     [[fallthrough]];
4089   case CM_ScalarEpilogueNotNeededUsePredicate:
4090     LLVM_DEBUG(
4091         dbgs() << "LV: vector predicate hint/switch found.\n"
4092                << "LV: Not allowing scalar epilogue, creating predicated "
4093                << "vector loop.\n");
4094     break;
4095   case CM_ScalarEpilogueNotAllowedLowTripLoop:
4096     // fallthrough as a special case of OptForSize
4097   case CM_ScalarEpilogueNotAllowedOptSize:
4098     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
4099       LLVM_DEBUG(
4100           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
4101     else
4102       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
4103                         << "count.\n");
4104 
4105     // Bail if runtime checks are required, which are not good when optimising
4106     // for size.
4107     if (runtimeChecksRequired())
4108       return FixedScalableVFPair::getNone();
4109 
4110     break;
4111   }
4112 
4113   // The only loops we can vectorize without a scalar epilogue are loops with
4114   // a bottom-test and a single exiting block. We'd have to handle the fact
4115   // that not every instruction executes on the last iteration.  This will
4116   // require a lane mask which varies through the vector loop body.  (TODO)
4117   if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
4118     // If there was a tail-folding hint/switch, but we can't fold the tail by
4119     // masking, fallback to a vectorization with a scalar epilogue.
4120     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
4121       LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
4122                            "scalar epilogue instead.\n");
4123       ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4124       return computeFeasibleMaxVF(MaxTC, UserVF, false);
4125     }
4126     return FixedScalableVFPair::getNone();
4127   }
4128 
4129   // Now try tail folding.
4130 
4131   // Invalidate interleave groups that require an epilogue if we can't mask
4132   // the interleave-group.
4133   if (!useMaskedInterleavedAccesses(TTI)) {
4134     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
4135            "No decisions should have been taken at this point");
4136     // Note: There is no need to invalidate any cost modeling decisions here, as
4137     // none were taken so far.
4138     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
4139   }
4140 
4141   FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(MaxTC, UserVF, true);
4142 
4143   // Avoid tail folding if the trip count is known to be a multiple of any VF
4144   // we choose.
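       // Illustrative example (hypothetical numbers): if loop guards imply the
       // trip count is a multiple of 32 and MaxVFtimesIC == 16, the remainder
       // computed below is zero and tail folding is skipped.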
4145   std::optional<unsigned> MaxPowerOf2RuntimeVF =
4146       MaxFactors.FixedVF.getFixedValue();
4147   if (MaxFactors.ScalableVF) {
4148     std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
4149     if (MaxVScale && TTI.isVScaleKnownToBeAPowerOfTwo()) {
4150       MaxPowerOf2RuntimeVF = std::max<unsigned>(
4151           *MaxPowerOf2RuntimeVF,
4152           *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue());
4153     } else
4154       MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now.
4155   }
4156 
4157   if (MaxPowerOf2RuntimeVF && *MaxPowerOf2RuntimeVF > 0) {
4158     assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
4159            "MaxFixedVF must be a power of 2");
4160     unsigned MaxVFtimesIC =
4161         UserIC ? *MaxPowerOf2RuntimeVF * UserIC : *MaxPowerOf2RuntimeVF;
4162     ScalarEvolution *SE = PSE.getSE();
4163     // Currently only loops with countable exits are vectorized, but calling
4164     // getSymbolicMaxBackedgeTakenCount allows enablement work for loops with
4165     // uncountable exits whilst also ensuring the symbolic maximum and known
4166     // back-edge taken count remain identical for loops with countable exits.
4167     const SCEV *BackedgeTakenCount = PSE.getSymbolicMaxBackedgeTakenCount();
4168     assert(BackedgeTakenCount == PSE.getBackedgeTakenCount() &&
4169            "Invalid loop count");
4170     const SCEV *ExitCount = SE->getAddExpr(
4171         BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
4172     const SCEV *Rem = SE->getURemExpr(
4173         SE->applyLoopGuards(ExitCount, TheLoop),
4174         SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
4175     if (Rem->isZero()) {
4176       // Accept MaxFixedVF if we do not have a tail.
4177       LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
4178       return MaxFactors;
4179     }
4180   }
4181 
4182   // If we don't know the precise trip count, or if the trip count that we
4183   // found modulo the vectorization factor is not zero, try to fold the tail
4184   // by masking.
4185   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
4186   setTailFoldingStyles(MaxFactors.ScalableVF.isScalable(), UserIC);
4187   if (foldTailByMasking()) {
4188     if (getTailFoldingStyle() == TailFoldingStyle::DataWithEVL) {
4189       LLVM_DEBUG(
4190           dbgs()
4191           << "LV: tail is folded with EVL, forcing unroll factor to be 1. Will "
4192              "try to generate VP Intrinsics with scalable vector "
4193              "factors only.\n");
4194       // A tail-folded loop using VP intrinsics is restricted to scalable
4195       // VFs for now.
4196       // TODO: extend it for fixed vectors, if required.
4197       assert(MaxFactors.ScalableVF.isScalable() &&
4198              "Expected scalable vector factor.");
4199 
4200       MaxFactors.FixedVF = ElementCount::getFixed(1);
4201     }
4202     return MaxFactors;
4203   }
4204 
4205   // If there was a tail-folding hint/switch, but we can't fold the tail by
4206   // masking, fallback to a vectorization with a scalar epilogue.
4207   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
4208     LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
4209                          "scalar epilogue instead.\n");
4210     ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4211     return MaxFactors;
4212   }
4213 
4214   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
4215     LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
4216     return FixedScalableVFPair::getNone();
4217   }
4218 
4219   if (TC == 0) {
4220     reportVectorizationFailure(
4221         "unable to calculate the loop count due to complex control flow",
4222         "UnknownLoopCountComplexCFG", ORE, TheLoop);
4223     return FixedScalableVFPair::getNone();
4224   }
4225 
4226   reportVectorizationFailure(
4227       "Cannot optimize for size and vectorize at the same time.",
4228       "cannot optimize for size and vectorize at the same time. "
4229       "Enable vectorization of this loop with '#pragma clang loop "
4230       "vectorize(enable)' when compiling with -Os/-Oz",
4231       "NoTailLoopWithOptForSize", ORE, TheLoop);
4232   return FixedScalableVFPair::getNone();
4233 }
4234 
4235 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
4236     unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType,
4237     ElementCount MaxSafeVF, bool FoldTailByMasking) {
4238   bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
4239   const TypeSize WidestRegister = TTI.getRegisterBitWidth(
4240       ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
4241                            : TargetTransformInfo::RGK_FixedWidthVector);
4242 
4243   // Convenience function to return the minimum of two ElementCounts.
4244   auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
4245     assert((LHS.isScalable() == RHS.isScalable()) &&
4246            "Scalable flags must match");
4247     return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
4248   };
4249 
4250   // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
4251   // Note that both WidestRegister and WidestType may not be powers of 2.
4252   auto MaxVectorElementCount = ElementCount::get(
4253       llvm::bit_floor(WidestRegister.getKnownMinValue() / WidestType),
4254       ComputeScalableMaxVF);
4255   MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
4256   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
4257                     << (MaxVectorElementCount * WidestType) << " bits.\n");
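       // Illustrative example (hypothetical numbers): with 256-bit registers and
       // WidestType == 24, 256 / 24 == 10 and bit_floor(10) == 8, so the starting
       // MaxVectorElementCount is 8 (or vscale x 8) before the MaxSafeVF clamp.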
4258 
4259   if (!MaxVectorElementCount) {
4260     LLVM_DEBUG(dbgs() << "LV: The target has no "
4261                       << (ComputeScalableMaxVF ? "scalable" : "fixed")
4262                       << " vector registers.\n");
4263     return ElementCount::getFixed(1);
4264   }
4265 
4266   unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue();
4267   if (MaxVectorElementCount.isScalable() &&
4268       TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
4269     auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
4270     auto Min = Attr.getVScaleRangeMin();
4271     WidestRegisterMinEC *= Min;
4272   }
4273 
4274   // When a scalar epilogue is required, at least one iteration of the scalar
4275   // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
4276   // max VF that results in a dead vector loop.
4277   if (MaxTripCount > 0 && requiresScalarEpilogue(true))
4278     MaxTripCount -= 1;
4279 
4280   if (MaxTripCount && MaxTripCount <= WidestRegisterMinEC &&
4281       (!FoldTailByMasking || isPowerOf2_32(MaxTripCount))) {
4282     // If upper bound loop trip count (TC) is known at compile time there is no
4283     // point in choosing VF greater than TC (as done in the loop below). Select
4284     // maximum power of two which doesn't exceed TC. If MaxVectorElementCount is
4285     // scalable, we only fall back on a fixed VF when the TC is less than or
4286     // equal to the known number of lanes.
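         // Illustrative example (hypothetical numbers): with MaxTripCount == 20,
         // a register wide enough for 32 elements and no tail folding,
         // bit_floor(20) == 16, so a fixed VF of 16 is returned here.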
4287     auto ClampedUpperTripCount = llvm::bit_floor(MaxTripCount);
4288     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
4289                          "exceeding the constant trip count: "
4290                       << ClampedUpperTripCount << "\n");
4291     return ElementCount::get(
4292         ClampedUpperTripCount,
4293         FoldTailByMasking ? MaxVectorElementCount.isScalable() : false);
4294   }
4295 
4296   TargetTransformInfo::RegisterKind RegKind =
4297       ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
4298                            : TargetTransformInfo::RGK_FixedWidthVector;
4299   ElementCount MaxVF = MaxVectorElementCount;
4300   if (MaximizeBandwidth ||
4301       (MaximizeBandwidth.getNumOccurrences() == 0 &&
4302        (TTI.shouldMaximizeVectorBandwidth(RegKind) ||
4303         (UseWiderVFIfCallVariantsPresent && Legal->hasVectorCallVariants())))) {
4304     auto MaxVectorElementCountMaxBW = ElementCount::get(
4305         llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType),
4306         ComputeScalableMaxVF);
4307     MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
4308 
4309     // Collect all viable vectorization factors larger than the default MaxVF
4310     // (i.e. MaxVectorElementCount).
4311     SmallVector<ElementCount, 8> VFs;
4312     for (ElementCount VS = MaxVectorElementCount * 2;
4313          ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
4314       VFs.push_back(VS);
4315 
4316     // For each VF calculate its register usage.
4317     auto RUs = calculateRegisterUsage(VFs);
4318 
4319     // Select the largest VF which doesn't require more registers than existing
4320     // ones.
4321     for (int I = RUs.size() - 1; I >= 0; --I) {
4322       const auto &MLU = RUs[I].MaxLocalUsers;
4323       if (all_of(MLU, [&](decltype(MLU.front()) &LU) {
4324             return LU.second <= TTI.getNumberOfRegisters(LU.first);
4325           })) {
4326         MaxVF = VFs[I];
4327         break;
4328       }
4329     }
4330     if (ElementCount MinVF =
4331             TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
4332       if (ElementCount::isKnownLT(MaxVF, MinVF)) {
4333         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
4334                           << ") with target's minimum: " << MinVF << '\n');
4335         MaxVF = MinVF;
4336       }
4337     }
4338 
4339     // Invalidate any widening decisions we might have made, in case the loop
4340     // requires predication (decided later), but we have already made some
4341     // load/store widening decisions.
4342     invalidateCostModelingDecisions();
4343   }
4344   return MaxVF;
4345 }
4346 
4347 /// Convenience function that returns the value of vscale_range iff
4348 /// vscale_range.min == vscale_range.max or otherwise returns the value
4349 /// returned by the corresponding TTI method.
4350 static std::optional<unsigned>
4351 getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI) {
4352   const Function *Fn = L->getHeader()->getParent();
4353   if (Fn->hasFnAttribute(Attribute::VScaleRange)) {
4354     auto Attr = Fn->getFnAttribute(Attribute::VScaleRange);
4355     auto Min = Attr.getVScaleRangeMin();
4356     auto Max = Attr.getVScaleRangeMax();
4357     if (Max && Min == Max)
4358       return Max;
4359   }
4360 
4361   return TTI.getVScaleForTuning();
4362 }
4363 
4364 /// This function attempts to return a value that represents the vectorization
4365 /// factor at runtime. For fixed-width VFs we know this precisely at compile
4366 /// time, but for scalable VFs we calculate it based on an estimate of the
4367 /// vscale value.
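     /// For example (illustrative), a VF of vscale x 4 with a tuning vscale of 2
     /// yields an estimated runtime VF of 8.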
4368 static unsigned getEstimatedRuntimeVF(const Loop *L,
4369                                       const TargetTransformInfo &TTI,
4370                                       ElementCount VF) {
4371   unsigned EstimatedVF = VF.getKnownMinValue();
4372   if (VF.isScalable())
4373     if (std::optional<unsigned> VScale = getVScaleForTuning(L, TTI))
4374       EstimatedVF *= *VScale;
4375   assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
4376   return EstimatedVF;
4377 }
4378 
4379 bool LoopVectorizationPlanner::isMoreProfitable(
4380     const VectorizationFactor &A, const VectorizationFactor &B,
4381     const unsigned MaxTripCount) const {
4382   InstructionCost CostA = A.Cost;
4383   InstructionCost CostB = B.Cost;
4384 
4385   // Improve estimate for the vector width if it is scalable.
4386   unsigned EstimatedWidthA = A.Width.getKnownMinValue();
4387   unsigned EstimatedWidthB = B.Width.getKnownMinValue();
4388   if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI)) {
4389     if (A.Width.isScalable())
4390       EstimatedWidthA *= *VScale;
4391     if (B.Width.isScalable())
4392       EstimatedWidthB *= *VScale;
4393   }
4394 
4395   // Assume vscale may be larger than 1 (or the value being tuned for),
4396   // so that scalable vectorization is slightly favorable over fixed-width
4397   // vectorization.
4398   bool PreferScalable = !TTI.preferFixedOverScalableIfEqualCost() &&
4399                         A.Width.isScalable() && !B.Width.isScalable();
4400 
4401   auto CmpFn = [PreferScalable](const InstructionCost &LHS,
4402                                 const InstructionCost &RHS) {
4403     return PreferScalable ? LHS <= RHS : LHS < RHS;
4404   };
4405 
4406   // To avoid the need for FP division:
4407   //      (CostA / EstimatedWidthA) < (CostB / EstimatedWidthB)
4408   // <=>  (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA)
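       // Illustrative example (hypothetical numbers): CostA == 8 at estimated
       // width 4 vs. CostB == 10 at estimated width 8 compares 8 * 8 == 64
       // against 10 * 4 == 40; 64 is not less than 40, so A is not considered
       // more profitable than B.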
4409   if (!MaxTripCount)
4410     return CmpFn(CostA * EstimatedWidthB, CostB * EstimatedWidthA);
4411 
4412   auto GetCostForTC = [MaxTripCount, this](unsigned VF,
4413                                            InstructionCost VectorCost,
4414                                            InstructionCost ScalarCost) {
4415     // If the trip count is a known (possibly small) constant, the trip count
4416     // will be rounded up to an integer number of iterations under
4417     // FoldTailByMasking. The total cost in that case will be
4418     // VecCost*ceil(TripCount/VF). When not folding the tail, the total
4419     // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be
4420     // some extra overheads, but for the purpose of comparing the costs of
4421     // different VFs we can use this to compare the total loop-body cost
4422     // expected after vectorization.
4423     if (CM.foldTailByMasking())
4424       return VectorCost * divideCeil(MaxTripCount, VF);
4425     return VectorCost * (MaxTripCount / VF) + ScalarCost * (MaxTripCount % VF);
4426   };
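       // Illustrative example (hypothetical numbers): with MaxTripCount == 10,
       // VF == 4, VectorCost == 20 and ScalarCost == 4, folding the tail costs
       // 20 * ceil(10 / 4) == 60, while using a scalar epilogue costs
       // 20 * (10 / 4) + 4 * (10 % 4) == 48.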
4427 
4428   auto RTCostA = GetCostForTC(EstimatedWidthA, CostA, A.ScalarCost);
4429   auto RTCostB = GetCostForTC(EstimatedWidthB, CostB, B.ScalarCost);
4430   return CmpFn(RTCostA, RTCostB);
4431 }
4432 
4433 bool LoopVectorizationPlanner::isMoreProfitable(
4434     const VectorizationFactor &A, const VectorizationFactor &B) const {
4435   const unsigned MaxTripCount = PSE.getSmallConstantMaxTripCount();
4436   return LoopVectorizationPlanner::isMoreProfitable(A, B, MaxTripCount);
4437 }
4438 
4439 void LoopVectorizationPlanner::emitInvalidCostRemarks(
4440     OptimizationRemarkEmitter *ORE) {
4441   using RecipeVFPair = std::pair<VPRecipeBase *, ElementCount>;
4442   SmallVector<RecipeVFPair> InvalidCosts;
4443   for (const auto &Plan : VPlans) {
4444     for (ElementCount VF : Plan->vectorFactors()) {
4445       VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(),
4446                             CM);
4447       precomputeCosts(*Plan, VF, CostCtx);
4448       auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry());
4449       for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
4450         for (auto &R : *VPBB) {
4451           if (!R.cost(VF, CostCtx).isValid())
4452             InvalidCosts.emplace_back(&R, VF);
4453         }
4454       }
4455     }
4456   }
4457   if (InvalidCosts.empty())
4458     return;
4459 
4460   // Emit a report of VFs with invalid costs in the loop.
4461 
4462   // Group the remarks per recipe, keeping the recipe order from InvalidCosts.
4463   DenseMap<VPRecipeBase *, unsigned> Numbering;
4464   unsigned I = 0;
4465   for (auto &Pair : InvalidCosts)
4466     if (!Numbering.count(Pair.first))
4467       Numbering[Pair.first] = I++;
4468 
4469   // Sort the list, first on recipe(number) then on VF.
4470   sort(InvalidCosts, [&Numbering](RecipeVFPair &A, RecipeVFPair &B) {
4471     if (Numbering[A.first] != Numbering[B.first])
4472       return Numbering[A.first] < Numbering[B.first];
4473     const auto &LHS = A.second;
4474     const auto &RHS = B.second;
4475     return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
4476            std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
4477   });
4478 
4479   // For a list of ordered recipe-VF pairs:
4480   //   [(load, VF1), (load, VF2), (store, VF1)]
4481   // group the recipes together to emit separate remarks for:
4482   //   load  (VF1, VF2)
4483   //   store (VF1)
4484   auto Tail = ArrayRef<RecipeVFPair>(InvalidCosts);
4485   auto Subset = ArrayRef<RecipeVFPair>();
4486   do {
4487     if (Subset.empty())
4488       Subset = Tail.take_front(1);
4489 
4490     VPRecipeBase *R = Subset.front().first;
4491 
4492     unsigned Opcode =
4493         TypeSwitch<const VPRecipeBase *, unsigned>(R)
4494             .Case<VPHeaderPHIRecipe>(
4495                 [](const auto *R) { return Instruction::PHI; })
4496             .Case<VPWidenSelectRecipe>(
4497                 [](const auto *R) { return Instruction::Select; })
4498             .Case<VPWidenStoreRecipe>(
4499                 [](const auto *R) { return Instruction::Store; })
4500             .Case<VPWidenLoadRecipe>(
4501                 [](const auto *R) { return Instruction::Load; })
4502             .Case<VPWidenCallRecipe, VPWidenIntrinsicRecipe>(
4503                 [](const auto *R) { return Instruction::Call; })
4504             .Case<VPInstruction, VPWidenRecipe, VPReplicateRecipe,
4505                   VPWidenCastRecipe>(
4506                 [](const auto *R) { return R->getOpcode(); })
4507             .Case<VPInterleaveRecipe>([](const VPInterleaveRecipe *R) {
4508               return R->getStoredValues().empty() ? Instruction::Load
4509                                                   : Instruction::Store;
4510             });
4511 
4512     // If the next recipe is different, or if there are no other pairs,
4513     // emit a remark for the collated subset. e.g.
4514     //   [(load, VF1), (load, VF2))]
4515     // to emit:
4516     //  remark: invalid costs for 'load' at VF=(VF1, VF2)
4517     if (Subset == Tail || Tail[Subset.size()].first != R) {
4518       std::string OutString;
4519       raw_string_ostream OS(OutString);
4520       assert(!Subset.empty() && "Unexpected empty range");
4521       OS << "Recipe with invalid costs prevented vectorization at VF=(";
4522       for (const auto &Pair : Subset)
4523         OS << (Pair.second == Subset.front().second ? "" : ", ") << Pair.second;
4524       OS << "):";
4525       if (Opcode == Instruction::Call) {
4526         StringRef Name = "";
4527         if (auto *Int = dyn_cast<VPWidenIntrinsicRecipe>(R)) {
4528           Name = Int->getIntrinsicName();
4529         } else {
4530           auto *WidenCall = dyn_cast<VPWidenCallRecipe>(R);
4531           Function *CalledFn =
4532               WidenCall ? WidenCall->getCalledScalarFunction()
4533                         : cast<Function>(R->getOperand(R->getNumOperands() - 1)
4534                                              ->getLiveInIRValue());
4535           Name = CalledFn->getName();
4536         }
4537         OS << " call to " << Name;
4538       } else
4539         OS << " " << Instruction::getOpcodeName(Opcode);
4540       reportVectorizationInfo(OutString, "InvalidCost", ORE, OrigLoop, nullptr,
4541                               R->getDebugLoc());
4542       Tail = Tail.drop_front(Subset.size());
4543       Subset = {};
4544     } else
4545       // Grow the subset by one element
4546       Subset = Tail.take_front(Subset.size() + 1);
4547   } while (!Tail.empty());
4548 }
4549 
4550 /// Check if any recipe of \p Plan will generate a vector value, which will be
4551 /// assigned a vector register.
4552 static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
4553                                 const TargetTransformInfo &TTI) {
4554   assert(VF.isVector() && "Checking a scalar VF?");
4555   VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
4556   DenseSet<VPRecipeBase *> EphemeralRecipes;
4557   collectEphemeralRecipesForVPlan(Plan, EphemeralRecipes);
4558   // Set of already visited types.
4559   DenseSet<Type *> Visited;
4560   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
4561            vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
4562     for (VPRecipeBase &R : *VPBB) {
4563       if (EphemeralRecipes.contains(&R))
4564         continue;
4565       // Continue early if the recipe is considered to not produce a vector
4566       // result. Note that this includes VPInstruction where some opcodes may
4567       // produce a vector, to preserve existing behavior as VPInstructions model
4568       // aspects not directly mapped to existing IR instructions.
4569       switch (R.getVPDefID()) {
4570       case VPDef::VPDerivedIVSC:
4571       case VPDef::VPScalarIVStepsSC:
4572       case VPDef::VPScalarCastSC:
4573       case VPDef::VPReplicateSC:
4574       case VPDef::VPInstructionSC:
4575       case VPDef::VPCanonicalIVPHISC:
4576       case VPDef::VPVectorPointerSC:
4577       case VPDef::VPReverseVectorPointerSC:
4578       case VPDef::VPExpandSCEVSC:
4579       case VPDef::VPEVLBasedIVPHISC:
4580       case VPDef::VPPredInstPHISC:
4581       case VPDef::VPBranchOnMaskSC:
4582         continue;
4583       case VPDef::VPReductionSC:
4584       case VPDef::VPActiveLaneMaskPHISC:
4585       case VPDef::VPWidenCallSC:
4586       case VPDef::VPWidenCanonicalIVSC:
4587       case VPDef::VPWidenCastSC:
4588       case VPDef::VPWidenGEPSC:
4589       case VPDef::VPWidenIntrinsicSC:
4590       case VPDef::VPWidenSC:
4591       case VPDef::VPWidenSelectSC:
4592       case VPDef::VPBlendSC:
4593       case VPDef::VPFirstOrderRecurrencePHISC:
4594       case VPDef::VPWidenPHISC:
4595       case VPDef::VPWidenIntOrFpInductionSC:
4596       case VPDef::VPWidenPointerInductionSC:
4597       case VPDef::VPReductionPHISC:
4598       case VPDef::VPInterleaveSC:
4599       case VPDef::VPWidenLoadEVLSC:
4600       case VPDef::VPWidenLoadSC:
4601       case VPDef::VPWidenStoreEVLSC:
4602       case VPDef::VPWidenStoreSC:
4603         break;
4604       default:
4605         llvm_unreachable("unhandled recipe");
4606       }
4607 
4608       auto WillWiden = [&TTI, VF](Type *ScalarTy) {
4609         Type *VectorTy = toVectorTy(ScalarTy, VF);
4610         unsigned NumLegalParts = TTI.getNumberOfParts(VectorTy);
4611         if (!NumLegalParts)
4612           return false;
4613         if (VF.isScalable()) {
4614           // <vscale x 1 x iN> is assumed to be profitable over iN because
4615           // scalable registers are a distinct register class from scalar
4616           // ones. If we ever find a target which wants to lower scalable
4617           // vectors back to scalars, we'll need to update this code to
4618           // explicitly ask TTI about the register class uses for each part.
4619           return NumLegalParts <= VF.getKnownMinValue();
4620         }
4621         // Two or more elements sharing a register: genuinely vectorized.
4622         return NumLegalParts < VF.getKnownMinValue();
4623       };
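           // Illustrative example (hypothetical numbers): for VF == 8 and an i64
           // scalar type on a target with 128-bit vector registers, <8 x i64>
           // typically legalizes to 4 parts; 4 < 8, so the value counts as
           // genuinely vectorized.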
4624 
4625       // No defs and not a store (e.g. a branch): no value to check, continue.
4626       if (R.getNumDefinedValues() == 0 &&
4627           !isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe, VPInterleaveRecipe>(
4628               &R))
4629         continue;
4630       // For multi-def recipes (currently only interleaved loads), it
4631       // suffices to check the first def only.
4632       // For stores, check their stored value; for interleaved stores it
4633       // suffices to check the first stored value only. In all cases this
4634       // is the second operand.
4635       VPValue *ToCheck =
4636           R.getNumDefinedValues() >= 1 ? R.getVPValue(0) : R.getOperand(1);
4637       Type *ScalarTy = TypeInfo.inferScalarType(ToCheck);
4638       if (!Visited.insert({ScalarTy}).second)
4639         continue;
4640       if (WillWiden(ScalarTy))
4641         return true;
4642     }
4643   }
4644 
4645   return false;
4646 }
4647 
4648 #ifndef NDEBUG
4649 VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
4650   InstructionCost ExpectedCost = CM.expectedCost(ElementCount::getFixed(1));
4651   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
4652   assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
4653   assert(any_of(VPlans,
4654                 [](std::unique_ptr<VPlan> &P) {
4655                   return P->hasVF(ElementCount::getFixed(1));
4656                 }) &&
4657          "Expected Scalar VF to be a candidate");
4658 
4659   const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost,
4660                                        ExpectedCost);
4661   VectorizationFactor ChosenFactor = ScalarCost;
4662 
4663   bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
4664   if (ForceVectorization &&
4665       (VPlans.size() > 1 || !VPlans[0]->hasScalarVFOnly())) {
4666     // Ignore scalar width, because the user explicitly wants vectorization.
4667     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
4668     // evaluation.
4669     ChosenFactor.Cost = InstructionCost::getMax();
4670   }
4671 
4672   for (auto &P : VPlans) {
4673     for (ElementCount VF : P->vectorFactors()) {
4674       // The cost for scalar VF=1 is already calculated, so ignore it.
4675       if (VF.isScalar())
4676         continue;
4677 
4678       InstructionCost C = CM.expectedCost(VF);
4679       VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost);
4680 
4681       unsigned Width = getEstimatedRuntimeVF(OrigLoop, TTI, Candidate.Width);
4682       LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << VF
4683                         << " costs: " << (Candidate.Cost / Width));
4684       if (VF.isScalable())
4685         LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
4686                           << getVScaleForTuning(OrigLoop, TTI).value_or(1)
4687                           << ")");
4688       LLVM_DEBUG(dbgs() << ".\n");
4689 
4690       if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
4691         LLVM_DEBUG(
4692             dbgs()
4693             << "LV: Not considering vector loop of width " << VF
4694             << " because it will not generate any vector instructions.\n");
4695         continue;
4696       }
4697 
4698       if (isMoreProfitable(Candidate, ChosenFactor))
4699         ChosenFactor = Candidate;
4700     }
4701   }
4702 
4703   if (!EnableCondStoresVectorization && CM.hasPredStores()) {
4704     reportVectorizationFailure(
4705         "There are conditional stores.",
4706         "store that is conditionally executed prevents vectorization",
4707         "ConditionalStore", ORE, OrigLoop);
4708     ChosenFactor = ScalarCost;
4709   }
4710 
4711   LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
4712                  !isMoreProfitable(ChosenFactor, ScalarCost)) dbgs()
4713              << "LV: Vectorization seems to be not beneficial, "
4714              << "but was forced by a user.\n");
4715   return ChosenFactor;
4716 }
4717 #endif
4718 
4719 bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
4720     ElementCount VF) const {
4721   // Cross iteration phis such as reductions need special handling and are
4722   // currently unsupported.
4723   if (any_of(OrigLoop->getHeader()->phis(),
4724              [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(&Phi); }))
4725     return false;
4726 
4727   // Phis with uses outside of the loop require special handling and are
4728   // currently unsupported.
4729   for (const auto &Entry : Legal->getInductionVars()) {
4730     // Look for uses of the value of the induction at the last iteration.
4731     Value *PostInc =
4732         Entry.first->getIncomingValueForBlock(OrigLoop->getLoopLatch());
4733     for (User *U : PostInc->users())
4734       if (!OrigLoop->contains(cast<Instruction>(U)))
4735         return false;
4736     // Look for uses of penultimate value of the induction.
4737     for (User *U : Entry.first->users())
4738       if (!OrigLoop->contains(cast<Instruction>(U)))
4739         return false;
4740   }
4741 
4742   // Epilogue vectorization code has not been audited to ensure it handles
4743   // non-latch exits properly.  It may be fine, but it needs to be audited
4744   // and tested.
4745   // TODO: Add support for loops with an early exit.
4746   if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch())
4747     return false;
4748 
4749   return true;
4750 }
4751 
4752 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
4753     const ElementCount VF, const unsigned IC) const {
4754   // FIXME: We need a much better cost-model to take different parameters such
4755   // as register pressure, code size increase and cost of extra branches into
4756   // account. For now we apply a very crude heuristic and only consider loops
4757   // with vectorization factors larger than a certain value.
4758 
4759   // Allow the target to opt out entirely.
4760   if (!TTI.preferEpilogueVectorization())
4761     return false;
4762 
4763   // We also consider epilogue vectorization unprofitable for targets that don't
4764   // consider interleaving beneficial (e.g. MVE).
4765   if (TTI.getMaxInterleaveFactor(VF) <= 1)
4766     return false;
4767 
4768   // TODO: PR #108190 introduced a discrepancy between fixed-width and scalable
4769   // VFs when deciding profitability.
4770   // See related "TODO: extend to support scalable VFs." in
4771   // selectEpilogueVectorizationFactor.
4772   unsigned Multiplier = VF.isFixed() ? IC : 1;
4773   unsigned MinVFThreshold = EpilogueVectorizationMinVF.getNumOccurrences() > 0
4774                                 ? EpilogueVectorizationMinVF
4775                                 : TTI.getEpilogueVectorizationMinVF();
4776   return getEstimatedRuntimeVF(TheLoop, TTI, VF * Multiplier) >= MinVFThreshold;
4777 }
4778 
4779 VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
4780     const ElementCount MainLoopVF, unsigned IC) {
4781   VectorizationFactor Result = VectorizationFactor::Disabled();
4782   if (!EnableEpilogueVectorization) {
4783     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n");
4784     return Result;
4785   }
4786 
4787   if (!CM.isScalarEpilogueAllowed()) {
4788     LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no "
4789                          "epilogue is allowed.\n");
4790     return Result;
4791   }
4792 
4793   // Not really a cost consideration, but check for unsupported cases here to
4794   // simplify the logic.
4795   if (!isCandidateForEpilogueVectorization(MainLoopVF)) {
4796     LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop "
4797                          "is not a supported candidate.\n");
4798     return Result;
4799   }
4800 
4801   if (EpilogueVectorizationForceVF > 1) {
4802     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n");
4803     ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF);
4804     if (hasPlanWithVF(ForcedEC))
4805       return {ForcedEC, 0, 0};
4806 
4807     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not "
4808                          "viable.\n");
4809     return Result;
4810   }
4811 
4812   if (OrigLoop->getHeader()->getParent()->hasOptSize() ||
4813       OrigLoop->getHeader()->getParent()->hasMinSize()) {
4814     LLVM_DEBUG(
4815         dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n");
4816     return Result;
4817   }
4818 
4819   if (!CM.isEpilogueVectorizationProfitable(MainLoopVF, IC)) {
4820     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
4821                          "this loop\n");
4822     return Result;
4823   }
4824 
4825   // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
4826   // the main loop handles 8 lanes per iteration. We could still benefit from
4827   // vectorizing the epilogue loop with VF=4.
4828   ElementCount EstimatedRuntimeVF =
4829       ElementCount::getFixed(getEstimatedRuntimeVF(OrigLoop, TTI, MainLoopVF));
4830 
4831   ScalarEvolution &SE = *PSE.getSE();
4832   Type *TCType = Legal->getWidestInductionType();
4833   const SCEV *RemainingIterations = nullptr;
4834   unsigned MaxTripCount = 0;
4835   for (auto &NextVF : ProfitableVFs) {
4836     // Skip candidate VFs without a corresponding VPlan.
4837     if (!hasPlanWithVF(NextVF.Width))
4838       continue;
4839 
4840     // Skip candidate VFs with widths >= the (estimated) runtime VF (scalable
4841     // vectors) or > the VF of the main loop (fixed vectors).
4842     if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
4843          ElementCount::isKnownGE(NextVF.Width, EstimatedRuntimeVF)) ||
4844         (NextVF.Width.isScalable() &&
4845          ElementCount::isKnownGE(NextVF.Width, MainLoopVF)) ||
4846         (!NextVF.Width.isScalable() && !MainLoopVF.isScalable() &&
4847          ElementCount::isKnownGT(NextVF.Width, MainLoopVF)))
4848       continue;
4849 
4850     // If NextVF is greater than the number of remaining iterations, the
4851     // epilogue loop would be dead. Skip such factors.
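         // E.g. (hypothetical values): with a trip count of 37, MainLoopVF = 8 and
         // IC = 2, the main vector loop consumes multiples of 16 iterations,
         // leaving 37 % 16 = 5 remaining; a candidate epilogue VF of 8 could then
         // never execute and is skipped.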
4852     if (!MainLoopVF.isScalable() && !NextVF.Width.isScalable()) {
4853       // TODO: extend to support scalable VFs.
4854       if (!RemainingIterations) {
4855         const SCEV *TC = vputils::getSCEVExprForVPValue(
4856             getPlanFor(NextVF.Width).getTripCount(), SE);
4857         assert(!isa<SCEVCouldNotCompute>(TC) &&
4858                "Trip count SCEV must be computable");
4859         RemainingIterations = SE.getURemExpr(
4860             TC, SE.getConstant(TCType, MainLoopVF.getKnownMinValue() * IC));
4861         MaxTripCount = MainLoopVF.getKnownMinValue() * IC - 1;
4862         if (SE.isKnownPredicate(CmpInst::ICMP_ULT, RemainingIterations,
4863                                 SE.getConstant(TCType, MaxTripCount))) {
4864           MaxTripCount =
4865               SE.getUnsignedRangeMax(RemainingIterations).getZExtValue();
4866         }
4867         LLVM_DEBUG(dbgs() << "LEV: Maximum Trip Count for Epilogue: "
4868                           << MaxTripCount << "\n");
4869       }
4870       if (SE.isKnownPredicate(
4871               CmpInst::ICMP_UGT,
4872               SE.getConstant(TCType, NextVF.Width.getKnownMinValue()),
4873               RemainingIterations))
4874         continue;
4875     }
4876 
4877     if (Result.Width.isScalar() ||
4878         isMoreProfitable(NextVF, Result, MaxTripCount))
4879       Result = NextVF;
4880   }
4881 
4882   if (Result != VectorizationFactor::Disabled())
4883     LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
4884                       << Result.Width << "\n");
4885   return Result;
4886 }
4887 
4888 std::pair<unsigned, unsigned>
4889 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
4890   unsigned MinWidth = -1U;
4891   unsigned MaxWidth = 8;
4892   const DataLayout &DL = TheFunction->getDataLayout();
4893   // For in-loop reductions, no element types are added to ElementTypesInLoop
4894   // if there are no loads/stores in the loop. In this case, check through the
4895   // reduction variables to determine the maximum width.
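       // E.g. (illustrative): a loop whose loads/stores touch only i8 and i32
       // values would report {8, 32} from this function.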
4896   if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
4897     // Reset MaxWidth so that we can find the smallest type used by recurrences
4898     // in the loop.
4899     MaxWidth = -1U;
4900     for (const auto &PhiDescriptorPair : Legal->getReductionVars()) {
4901       const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
4902       // When finding the min width used by the recurrence we need to account
4903       // for casts on the input operands of the recurrence.
4904       MaxWidth = std::min<unsigned>(
4905           MaxWidth, std::min<unsigned>(
4906                         RdxDesc.getMinWidthCastToRecurrenceTypeInBits(),
4907                         RdxDesc.getRecurrenceType()->getScalarSizeInBits()));
4908     }
4909   } else {
4910     for (Type *T : ElementTypesInLoop) {
4911       MinWidth = std::min<unsigned>(
4912           MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
4913       MaxWidth = std::max<unsigned>(
4914           MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
4915     }
4916   }
4917   return {MinWidth, MaxWidth};
4918 }
4919 
4920 void LoopVectorizationCostModel::collectElementTypesForWidening() {
4921   ElementTypesInLoop.clear();
4922   // For each block.
4923   for (BasicBlock *BB : TheLoop->blocks()) {
4924     // For each instruction in the loop.
4925     for (Instruction &I : BB->instructionsWithoutDebug()) {
4926       Type *T = I.getType();
4927 
4928       // Skip ignored values.
4929       if (ValuesToIgnore.count(&I))
4930         continue;
4931 
4932       // Only examine Loads, Stores and PHINodes.
4933       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
4934         continue;
4935 
4936       // Examine PHI nodes that are reduction variables. Update the type to
4937       // account for the recurrence type.
4938       if (auto *PN = dyn_cast<PHINode>(&I)) {
4939         if (!Legal->isReductionVariable(PN))
4940           continue;
4941         const RecurrenceDescriptor &RdxDesc =
4942             Legal->getReductionVars().find(PN)->second;
4943         if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
4944             TTI.preferInLoopReduction(RdxDesc.getOpcode(),
4945                                       RdxDesc.getRecurrenceType(),
4946                                       TargetTransformInfo::ReductionFlags()))
4947           continue;
4948         T = RdxDesc.getRecurrenceType();
4949       }
4950 
4951       // Examine the stored values.
4952       if (auto *ST = dyn_cast<StoreInst>(&I))
4953         T = ST->getValueOperand()->getType();
4954 
4955       assert(T->isSized() &&
4956              "Expected the load/store/recurrence type to be sized");
4957 
4958       ElementTypesInLoop.insert(T);
4959     }
4960   }
4961 }
4962 
4963 unsigned
4964 LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
4965                                                   InstructionCost LoopCost) {
4966   // -- The interleave heuristics --
4967   // We interleave the loop in order to expose ILP and reduce the loop overhead.
4968   // There are many micro-architectural considerations that we can't predict
4969   // at this level. For example, frontend pressure (on decode or fetch) due to
4970   // code size, or the number and capabilities of the execution ports.
4971   //
4972   // We use the following heuristics to select the interleave count:
4973   // 1. If the code has reductions, then we interleave to break the cross
4974   // iteration dependency.
4975   // 2. If the loop is really small, then we interleave to reduce the loop
4976   // overhead.
4977   // 3. We don't interleave if we think that we will spill registers to memory
4978   // due to the increased register pressure.
4979 
4980   if (!isScalarEpilogueAllowed())
4981     return 1;
4982 
4983   // Do not interleave if EVL is preferred and no User IC is specified.
4984   if (foldTailWithEVL()) {
4985     LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. "
4986                          "Unroll factor forced to be 1.\n");
4987     return 1;
4988   }
4989 
4990   // The maximum safe dependence distance already constrains the VF; do not interleave.
4991   if (!Legal->isSafeForAnyVectorWidth())
4992     return 1;
4993 
4994   // We don't attempt to perform interleaving for loops with uncountable early
4995   // exits because the VPInstruction::AnyOf code cannot currently handle
4996   // multiple parts.
4997   if (Legal->hasUncountableEarlyExit())
4998     return 1;
4999 
5000   auto BestKnownTC = getSmallBestKnownTC(PSE, TheLoop);
5001   const bool HasReductions = !Legal->getReductionVars().empty();
5002 
5003   // If we did not calculate the cost for VF (because the user selected the VF)
5004   // then we calculate the cost of VF here.
5005   if (LoopCost == 0) {
5006     LoopCost = expectedCost(VF);
5007     assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost");
5008 
5009     // Loop body is free and there is no need for interleaving.
5010     if (LoopCost == 0)
5011       return 1;
5012   }
5013 
5014   RegisterUsage R = calculateRegisterUsage({VF})[0];
5015   // We divide by these counts below, so clamp each to at least one (i.e. assume
5016   // at least one instruction uses at least one register).
5017   for (auto &Pair : R.MaxLocalUsers) {
5018     Pair.second = std::max(Pair.second, 1U);
5019   }
5020 
5021   // We calculate the interleave count using the following formula.
5022   // Subtract the number of loop invariants from the number of available
5023   // registers. These registers are used by all of the interleaved instances.
5024   // Next, divide the remaining registers by the number of registers that is
5025   // required by the loop, in order to estimate how many parallel instances
5026   // fit without causing spills. All of this is rounded down if necessary to be
5027   // a power of two. We want a power-of-two interleave count to simplify any
5028   // addressing operations or alignment considerations.
5029   // We also want power-of-two interleave counts to ensure that the induction
5030   // variable of the vector loop wraps to zero when the tail is folded by masking;
5031   // this currently happens when OptForSize, in which case IC is set to 1 above.
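       // Worked example with hypothetical numbers: with 32 registers in a class,
       // 4 of them held by loop invariants and at most 7 live values at any point,
       // (32 - 4) / 7 = 4 instances fit, and bit_floor(4) = 4 becomes the candidate
       // interleave count for that class (before the optional induction-variable
       // adjustment below).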
5032   unsigned IC = UINT_MAX;
5033 
5034   for (const auto &Pair : R.MaxLocalUsers) {
5035     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(Pair.first);
5036     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5037                       << " registers of "
5038                       << TTI.getRegisterClassName(Pair.first)
5039                       << " register class\n");
5040     if (VF.isScalar()) {
5041       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5042         TargetNumRegisters = ForceTargetNumScalarRegs;
5043     } else {
5044       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5045         TargetNumRegisters = ForceTargetNumVectorRegs;
5046     }
5047     unsigned MaxLocalUsers = Pair.second;
5048     unsigned LoopInvariantRegs = 0;
5049     if (R.LoopInvariantRegs.find(Pair.first) != R.LoopInvariantRegs.end())
5050       LoopInvariantRegs = R.LoopInvariantRegs[Pair.first];
5051 
5052     unsigned TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs) /
5053                                      MaxLocalUsers);
5054     // Don't count the induction variable as interleaved.
5055     if (EnableIndVarRegisterHeur) {
5056       TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5057                               std::max(1U, (MaxLocalUsers - 1)));
5058     }
5059 
5060     IC = std::min(IC, TmpIC);
5061   }
5062 
5063   // Clamp the interleave ranges to reasonable counts.
5064   unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
5065 
5066   // Check if the user has overridden the max.
5067   if (VF.isScalar()) {
5068     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5069       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5070   } else {
5071     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5072       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5073   }
5074 
5075   unsigned EstimatedVF = getEstimatedRuntimeVF(TheLoop, TTI, VF);
5076   unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5077   if (KnownTC > 0) {
5078     // At least one iteration must be scalar when this constraint holds. So the
5079     // maximum available iterations for interleaving is one less.
5080     unsigned AvailableTC =
5081         requiresScalarEpilogue(VF.isVector()) ? KnownTC - 1 : KnownTC;
5082 
5083     // If trip count is known we select between two prospective ICs, where
5084     // 1) the aggressive IC is capped by the trip count divided by VF
5085     // 2) the conservative IC is capped by the trip count divided by (VF * 2)
5086     // The final IC is selected in a way that the epilogue loop trip count is
5087     // minimized while maximizing the IC itself, so that we either run the
5088     // vector loop at least once if it generates a small epilogue loop, or else
5089     // we run the vector loop at least twice.
5090 
5091     unsigned InterleaveCountUB = bit_floor(
5092         std::max(1u, std::min(AvailableTC / EstimatedVF, MaxInterleaveCount)));
5093     unsigned InterleaveCountLB = bit_floor(std::max(
5094         1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
5095     MaxInterleaveCount = InterleaveCountLB;
5096 
5097     if (InterleaveCountUB != InterleaveCountLB) {
5098       unsigned TailTripCountUB =
5099           (AvailableTC % (EstimatedVF * InterleaveCountUB));
5100       unsigned TailTripCountLB =
5101           (AvailableTC % (EstimatedVF * InterleaveCountLB));
5102       // If both produce the same scalar tail, maximize the IC to do the same
5103       // work in fewer vector loop iterations.
5104       if (TailTripCountUB == TailTripCountLB)
5105         MaxInterleaveCount = InterleaveCountUB;
5106     }
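         // Example with hypothetical numbers: AvailableTC = 48, EstimatedVF = 8 and
         // a target maximum of 4 give InterleaveCountUB = bit_floor(min(6, 4)) = 4
         // and InterleaveCountLB = bit_floor(min(3, 4)) = 2. The tails differ
         // (48 % 32 = 16 vs. 48 % 16 = 0), so the conservative LB of 2 is kept.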
5107   } else if (BestKnownTC && *BestKnownTC > 0) {
5108     // At least one iteration must be scalar when this constraint holds. So the
5109     // maximum available iterations for interleaving is one less.
5110     unsigned AvailableTC = requiresScalarEpilogue(VF.isVector())
5111                                ? (*BestKnownTC) - 1
5112                                : *BestKnownTC;
5113 
5114     // If the trip count is only an estimated compile-time constant, cap the IC
5115     // by the trip count divided by VF * 2, such that the vector loop runs at
5116     // least twice. This makes interleaving seem profitable when there is an
5117     // epilogue loop present. Since the exact trip count is not known, we choose
5118     // to be conservative in our IC estimate.
5119     MaxInterleaveCount = bit_floor(std::max(
5120         1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
5121   }
5122 
5123   assert(MaxInterleaveCount > 0 &&
5124          "Maximum interleave count must be greater than 0");
5125 
5126   // Clamp the calculated IC to be between the 1 and the max interleave count
5127   // that the target and trip count allows.
5128   if (IC > MaxInterleaveCount)
5129     IC = MaxInterleaveCount;
5130   else
5131     // Make sure IC is greater than 0.
5132     IC = std::max(1u, IC);
5133 
5134   assert(IC > 0 && "Interleave count must be greater than 0.");
5135 
5136   // Interleave if we vectorized this loop and there is a reduction that could
5137   // benefit from interleaving.
5138   if (VF.isVector() && HasReductions) {
5139     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5140     return IC;
5141   }
5142 
5143   // For any scalar loop that either requires runtime checks or predication we
5144   // are better off leaving this to the unroller. Note that if we've already
5145   // vectorized the loop we will have done the runtime check and so interleaving
5146   // won't require further checks.
5147   bool ScalarInterleavingRequiresPredication =
5148       (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) {
5149          return Legal->blockNeedsPredication(BB);
5150        }));
5151   bool ScalarInterleavingRequiresRuntimePointerCheck =
5152       (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
5153 
5154   // We want to interleave small loops in order to reduce the loop overhead and
5155   // potentially expose ILP opportunities.
5156   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
5157                     << "LV: IC is " << IC << '\n'
5158                     << "LV: VF is " << VF << '\n');
5159   const bool AggressivelyInterleaveReductions =
5160       TTI.enableAggressiveInterleaving(HasReductions);
5161   if (!ScalarInterleavingRequiresRuntimePointerCheck &&
5162       !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
5163     // We assume that the cost overhead is 1 and we use the cost model
5164     // to estimate the cost of the loop and interleave until the cost of the
5165     // loop overhead is about 5% of the cost of the loop.
5166     unsigned SmallIC = std::min(IC, (unsigned)llvm::bit_floor<uint64_t>(
5167                                         SmallLoopCost / *LoopCost.getValue()));
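         // E.g. (hypothetical costs): with SmallLoopCost = 20 and a loop cost of 5,
         // bit_floor(20 / 5) = 4, so SmallIC is at most 4 (and never exceeds IC).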
5168 
5169     // Interleave until store/load ports (estimated by max interleave count) are
5170     // saturated.
5171     unsigned NumStores = Legal->getNumStores();
5172     unsigned NumLoads = Legal->getNumLoads();
5173     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5174     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5175 
5176     // There is little point in interleaving for reductions containing selects
5177     // and compares when VF=1 since it may just create more overhead than it's
5178     // worth for loops with small trip counts. This is because we still have to
5179     // do the final reduction after the loop.
5180     bool HasSelectCmpReductions =
5181         HasReductions &&
5182         any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5183           const RecurrenceDescriptor &RdxDesc = Reduction.second;
5184           RecurKind RK = RdxDesc.getRecurrenceKind();
5185           return RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) ||
5186                  RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK);
5187         });
5188     if (HasSelectCmpReductions) {
5189       LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
5190       return 1;
5191     }
5192 
5193     // If we have a scalar reduction (vector reductions are already dealt with
5194     // by this point), we can increase the critical path length if the loop
5195     // we're interleaving is inside another loop. For tree-wise reductions
5196     // set the limit to 2, and for ordered reductions it's best to disable
5197     // interleaving entirely.
5198     if (HasReductions && TheLoop->getLoopDepth() > 1) {
5199       bool HasOrderedReductions =
5200           any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5201             const RecurrenceDescriptor &RdxDesc = Reduction.second;
5202             return RdxDesc.isOrdered();
5203           });
5204       if (HasOrderedReductions) {
5205         LLVM_DEBUG(
5206             dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
5207         return 1;
5208       }
5209 
5210       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5211       SmallIC = std::min(SmallIC, F);
5212       StoresIC = std::min(StoresIC, F);
5213       LoadsIC = std::min(LoadsIC, F);
5214     }
5215 
5216     if (EnableLoadStoreRuntimeInterleave &&
5217         std::max(StoresIC, LoadsIC) > SmallIC) {
5218       LLVM_DEBUG(
5219           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5220       return std::max(StoresIC, LoadsIC);
5221     }
5222 
5223     // If there are scalar reductions and TTI has enabled aggressive
5224     // interleaving for reductions, we will interleave to expose ILP.
5225     if (VF.isScalar() && AggressivelyInterleaveReductions) {
5226       LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5227       // Interleave no less than SmallIC but not as aggressive as the normal IC
5228       // to satisfy the rare situation when resources are too limited.
5229       return std::max(IC / 2, SmallIC);
5230     }
5231 
5232     LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5233     return SmallIC;
5234   }
5235 
5236   // Interleave if this is a large loop (small loops are already dealt with by
5237   // this point) that could benefit from interleaving.
5238   if (AggressivelyInterleaveReductions) {
5239     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5240     return IC;
5241   }
5242 
5243   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5244   return 1;
5245 }
5246 
5247 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5248 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
5249   // This function calculates the register usage by measuring the highest number
5250   // of values that are alive at a single location. Obviously, this is a very
5251   // rough estimation. We scan the loop in topological order and
5252   // assign a number to each instruction. We use RPO to ensure that defs are
5253   // met before their users. We assume that each instruction that has in-loop
5254   // users starts an interval. We record every time that an in-loop value is
5255   // used, so we have a list of the first and last occurrences of each
5256   // instruction. Next, we transpose this data structure into a multi map that
5257   // holds the list of intervals that *end* at a specific location. This multi
5258   // map allows us to perform a linear search. We scan the instructions linearly
5259   // and record each time that a new interval starts, by placing it in a set.
5260   // If we find this value in the multi-map then we remove it from the set.
5261   // The max register usage is the maximum size of the set.
5262   // We also search for instructions that are defined outside the loop, but are
5263   // used inside the loop. We need this number separately from the max-interval
5264   // usage number because when we unroll, loop-invariant values do not take
5265   // more registers.
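       // Sketch of the interval bookkeeping on a hypothetical snippet:
       //   %a = load ...        ; starts an interval
       //   %b = add %a, 1       ; %a still live
       //   %c = mul %a, %b      ; both %a and %b live here
       // Two values are open when %c is reached, so this snippet's maximum
       // register usage is two (for the relevant register class).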
5266   LoopBlocksDFS DFS(TheLoop);
5267   DFS.perform(LI);
5268 
5269   RegisterUsage RU;
5270 
5271   // Each 'key' in the map opens a new interval. The values
5272   // of the map are the index of the 'last seen' usage of the
5273   // instruction that is the key.
5274   using IntervalMap = SmallDenseMap<Instruction *, unsigned, 16>;
5275 
5276   // Maps instruction to its index.
5277   SmallVector<Instruction *, 64> IdxToInstr;
5278   // Marks the end of each interval.
5279   IntervalMap EndPoint;
5280   // Saves the list of instruction indices that are used in the loop.
5281   SmallPtrSet<Instruction *, 8> Ends;
5282   // Saves the list of values that are used in the loop but are defined outside
5283   // the loop (not including non-instruction values such as arguments and
5284   // constants).
5285   SmallSetVector<Instruction *, 8> LoopInvariants;
5286 
5287   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5288     for (Instruction &I : BB->instructionsWithoutDebug()) {
5289       IdxToInstr.push_back(&I);
5290 
5291       // Save the end location of each USE.
5292       for (Value *U : I.operands()) {
5293         auto *Instr = dyn_cast<Instruction>(U);
5294 
5295         // Ignore non-instruction values such as arguments, constants, etc.
5296         // FIXME: Might need some motivation why these values are ignored. If,
5297         // for example, an argument is used inside the loop, it will increase the
5298         // register pressure (so shouldn't we add it to LoopInvariants?).
5299         if (!Instr)
5300           continue;
5301 
5302         // If this instruction is outside the loop then record it and continue.
5303         if (!TheLoop->contains(Instr)) {
5304           LoopInvariants.insert(Instr);
5305           continue;
5306         }
5307 
5308         // Overwrite previous end points.
5309         EndPoint[Instr] = IdxToInstr.size();
5310         Ends.insert(Instr);
5311       }
5312     }
5313   }
5314 
5315   // Saves the list of intervals that end with the index in 'key'.
5316   using InstrList = SmallVector<Instruction *, 2>;
5317   SmallDenseMap<unsigned, InstrList, 16> TransposeEnds;
5318 
5319   // Transpose the EndPoints to a list of values that end at each index.
5320   for (auto &Interval : EndPoint)
5321     TransposeEnds[Interval.second].push_back(Interval.first);
5322 
5323   SmallPtrSet<Instruction *, 8> OpenIntervals;
5324   SmallVector<RegisterUsage, 8> RUs(VFs.size());
5325   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
5326 
5327   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5328 
5329   const auto &TTICapture = TTI;
5330   auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
5331     if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty) ||
5332         (VF.isScalable() &&
5333          !TTICapture.isElementTypeLegalForScalableVector(Ty)))
5334       return 0;
5335     return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
5336   };
5337 
5338   for (unsigned int Idx = 0, Sz = IdxToInstr.size(); Idx < Sz; ++Idx) {
5339     Instruction *I = IdxToInstr[Idx];
5340 
5341     // Remove all of the instructions that end at this location.
5342     InstrList &List = TransposeEnds[Idx];
5343     for (Instruction *ToRemove : List)
5344       OpenIntervals.erase(ToRemove);
5345 
5346     // Ignore instructions that are never used within the loop.
5347     if (!Ends.count(I))
5348       continue;
5349 
5350     // Skip ignored values.
5351     if (ValuesToIgnore.count(I))
5352       continue;
5353 
5354     collectInLoopReductions();
5355 
5356     // For each VF find the maximum usage of registers.
5357     for (unsigned J = 0, E = VFs.size(); J < E; ++J) {
5358       // Count the number of registers used, per register class, given all open
5359       // intervals.
5360       // Note that elements in this SmallMapVector will be default constructed
5361       // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if
5362       // there is no previous entry for ClassID.
5363       SmallMapVector<unsigned, unsigned, 4> RegUsage;
5364 
5365       if (VFs[J].isScalar()) {
5366         for (auto *Inst : OpenIntervals) {
5367           unsigned ClassID =
5368               TTI.getRegisterClassForType(false, Inst->getType());
5369           // FIXME: The target might use more than one register for the type
5370           // even in the scalar case.
5371           RegUsage[ClassID] += 1;
5372         }
5373       } else {
5374         collectUniformsAndScalars(VFs[J]);
5375         for (auto *Inst : OpenIntervals) {
5376           // Skip ignored values for VF > 1.
5377           if (VecValuesToIgnore.count(Inst))
5378             continue;
5379           if (isScalarAfterVectorization(Inst, VFs[J])) {
5380             unsigned ClassID =
5381                 TTI.getRegisterClassForType(false, Inst->getType());
5382             // FIXME: The target might use more than one register for the type
5383             // even in the scalar case.
5384             RegUsage[ClassID] += 1;
5385           } else {
5386             unsigned ClassID =
5387                 TTI.getRegisterClassForType(true, Inst->getType());
5388             RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[J]);
5389           }
5390         }
5391       }
5392 
5393       for (const auto &Pair : RegUsage) {
5394         auto &Entry = MaxUsages[J][Pair.first];
5395         Entry = std::max(Entry, Pair.second);
5396       }
5397     }
5398 
5399     LLVM_DEBUG(dbgs() << "LV(REG): At #" << Idx << " Interval # "
5400                       << OpenIntervals.size() << '\n');
5401 
5402     // Add the current instruction to the list of open intervals.
5403     OpenIntervals.insert(I);
5404   }
5405 
5406   for (unsigned Idx = 0, End = VFs.size(); Idx < End; ++Idx) {
5407     // Note that elements in this SmallMapVector will be default constructed
5408     // as 0. So we can use "Invariant[ClassID] += n" in the code below even if
5409     // there is no previous entry for ClassID.
5410     SmallMapVector<unsigned, unsigned, 4> Invariant;
5411 
5412     for (auto *Inst : LoopInvariants) {
5413       // FIXME: The target might use more than one register for the type
5414       // even in the scalar case.
5415       bool IsScalar = all_of(Inst->users(), [&](User *U) {
5416         auto *I = cast<Instruction>(U);
5417         return TheLoop != LI->getLoopFor(I->getParent()) ||
5418                isScalarAfterVectorization(I, VFs[Idx]);
5419       });
5420 
5421       ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[Idx];
5422       unsigned ClassID =
5423           TTI.getRegisterClassForType(VF.isVector(), Inst->getType());
5424       Invariant[ClassID] += GetRegUsage(Inst->getType(), VF);
5425     }
5426 
5427     LLVM_DEBUG({
5428       dbgs() << "LV(REG): VF = " << VFs[Idx] << '\n';
5429       dbgs() << "LV(REG): Found max usage: " << MaxUsages[Idx].size()
5430              << " item\n";
5431       for (const auto &pair : MaxUsages[Idx]) {
5432         dbgs() << "LV(REG): RegisterClass: "
5433                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5434                << " registers\n";
5435       }
5436       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
5437              << " item\n";
5438       for (const auto &pair : Invariant) {
5439         dbgs() << "LV(REG): RegisterClass: "
5440                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5441                << " registers\n";
5442       }
5443     });
5444 
5445     RU.LoopInvariantRegs = Invariant;
5446     RU.MaxLocalUsers = MaxUsages[Idx];
5447     RUs[Idx] = RU;
5448   }
5449 
5450   return RUs;
5451 }
5452 
5453 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
5454                                                            ElementCount VF) {
5455   // TODO: The cost model for emulated masked load/store is completely
5456   // broken. This hack guides the cost model to use an artificially high enough
5457   // value to practically disable vectorization with such operations, except
5458   // where the previously deployed legality hack allowed using very low cost
5459   // values. This is to avoid regressions coming simply from moving the
5460   // "masked load/store" check from legality to the cost model.
5461   // Masked load/gather emulation was previously never allowed; only a limited
5462   // amount of masked store/scatter emulation was allowed.
5463   assert((isPredicatedInst(I)) &&
5464          "Expecting a scalar emulated instruction");
5465   return isa<LoadInst>(I) ||
5466          (isa<StoreInst>(I) &&
5467           NumPredStores > NumberOfStoresToPredicate);
5468 }
5469 
5470 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
5471   // If we aren't vectorizing the loop, or if we've already collected the
5472   // instructions to scalarize, there's nothing to do. Collection may already
5473   // have occurred if we have a user-selected VF and are now computing the
5474   // expected cost for interleaving.
5475   if (VF.isScalar() || VF.isZero() || InstsToScalarize.contains(VF))
5476     return;
5477 
5478   // Initialize a mapping for VF in InstsToScalarize. If we find that it's
5479   // not profitable to scalarize any instructions, the presence of VF in the
5480   // map will indicate that we've analyzed it already.
5481   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5482 
5483   PredicatedBBsAfterVectorization[VF].clear();
5484 
5485   // Find all the instructions that are scalar with predication in the loop and
5486   // determine if it would be better to not if-convert the blocks they are in.
5487   // If so, we also record the instructions to scalarize.
5488   for (BasicBlock *BB : TheLoop->blocks()) {
5489     if (!blockNeedsPredicationForAnyReason(BB))
5490       continue;
5491     for (Instruction &I : *BB)
5492       if (isScalarWithPredication(&I, VF)) {
5493         ScalarCostsTy ScalarCosts;
5494         // Do not apply discount logic for:
5495         // 1. Scalars after vectorization, as there will only be a single copy
5496         // of the instruction.
5497         // 2. Scalable VF, as that would lead to invalid scalarization costs.
5498         // 3. Emulated masked memrefs, if a hacked cost is needed.
5499         if (!isScalarAfterVectorization(&I, VF) && !VF.isScalable() &&
5500             !useEmulatedMaskMemRefHack(&I, VF) &&
5501             computePredInstDiscount(&I, ScalarCosts, VF) >= 0) {
5502           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5503           // Check if we decided to scalarize a call. If so, update the widening
5504           // decision of the call to CM_Scalarize with the computed scalar cost.
5505           for (const auto &[I, _] : ScalarCosts) {
5506             auto *CI = dyn_cast<CallInst>(I);
5507             if (!CI || !CallWideningDecisions.contains({CI, VF}))
5508               continue;
5509             CallWideningDecisions[{CI, VF}].Kind = CM_Scalarize;
5510             CallWideningDecisions[{CI, VF}].Cost = ScalarCosts[CI];
5511           }
5512         }
5513         // Remember that BB will remain after vectorization.
5514         PredicatedBBsAfterVectorization[VF].insert(BB);
5515         for (auto *Pred : predecessors(BB)) {
5516           if (Pred->getSingleSuccessor() == BB)
5517             PredicatedBBsAfterVectorization[VF].insert(Pred);
5518         }
5519       }
5520   }
5521 }
5522 
5523 InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
5524     Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
5525   assert(!isUniformAfterVectorization(PredInst, VF) &&
5526          "Instruction marked uniform-after-vectorization will be predicated");
5527 
5528   // Initialize the discount to zero, meaning that the scalar version and the
5529   // vector version cost the same.
5530   InstructionCost Discount = 0;
5531 
5532   // Holds instructions to analyze. The instructions we visit are mapped in
5533   // ScalarCosts. Those instructions are the ones that would be scalarized if
5534   // we find that the scalar version costs less.
5535   SmallVector<Instruction *, 8> Worklist;
5536 
5537   // Returns true if the given instruction can be scalarized.
5538   auto CanBeScalarized = [&](Instruction *I) -> bool {
5539     // We only attempt to scalarize instructions forming a single-use chain
5540     // from the original predicated block that would otherwise be vectorized.
5541     // Although not strictly necessary, we give up on instructions we know will
5542     // already be scalar to avoid traversing chains that are unlikely to be
5543     // beneficial.
5544     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5545         isScalarAfterVectorization(I, VF))
5546       return false;
5547 
5548     // If the instruction is scalar with predication, it will be analyzed
5549     // separately. We ignore it within the context of PredInst.
5550     if (isScalarWithPredication(I, VF))
5551       return false;
5552 
5553     // If any of the instruction's operands are uniform after vectorization,
5554     // the instruction cannot be scalarized. This prevents, for example, a
5555     // masked load from being scalarized.
5556     //
5557     // We assume we will only emit a value for lane zero of an instruction
5558     // marked uniform after vectorization, rather than VF identical values.
5559     // Thus, if we scalarize an instruction that uses a uniform, we would
5560     // create uses of values corresponding to the lanes we aren't emitting code
5561     // for. This behavior can be changed by allowing getScalarValue to clone
5562     // the lane zero values for uniforms rather than asserting.
5563     for (Use &U : I->operands())
5564       if (auto *J = dyn_cast<Instruction>(U.get()))
5565         if (isUniformAfterVectorization(J, VF))
5566           return false;
5567 
5568     // Otherwise, we can scalarize the instruction.
5569     return true;
5570   };
5571 
5572   // Compute the expected cost discount from scalarizing the entire expression
5573   // feeding the predicated instruction. We currently only consider expressions
5574   // that are single-use instruction chains.
5575   Worklist.push_back(PredInst);
5576   while (!Worklist.empty()) {
5577     Instruction *I = Worklist.pop_back_val();
5578 
5579     // If we've already analyzed the instruction, there's nothing to do.
5580     if (ScalarCosts.contains(I))
5581       continue;
5582 
5583     // Compute the cost of the vector instruction. Note that this cost already
5584     // includes the scalarization overhead of the predicated instruction.
5585     InstructionCost VectorCost = getInstructionCost(I, VF);
5586 
5587     // Compute the cost of the scalarized instruction. This cost is the cost of
5588     // the instruction as if it wasn't if-converted and instead remained in the
5589     // predicated block. We will scale this cost by block probability after
5590     // computing the scalarization overhead.
5591     InstructionCost ScalarCost =
5592         VF.getFixedValue() * getInstructionCost(I, ElementCount::getFixed(1));
5593 
5594     // Compute the scalarization overhead of needed insertelement instructions
5595     // and phi nodes.
5596     TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
5597     if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
5598       ScalarCost += TTI.getScalarizationOverhead(
5599           cast<VectorType>(toVectorTy(I->getType(), VF)),
5600           APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ true,
5601           /*Extract*/ false, CostKind);
5602       ScalarCost +=
5603           VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind);
5604     }
5605 
5606     // Compute the scalarization overhead of needed extractelement
5607     // instructions. For each of the instruction's operands, if the operand can
5608     // be scalarized, add it to the worklist; otherwise, account for the
5609     // overhead.
5610     for (Use &U : I->operands())
5611       if (auto *J = dyn_cast<Instruction>(U.get())) {
5612         assert(VectorType::isValidElementType(J->getType()) &&
5613                "Instruction has non-scalar type");
5614         if (CanBeScalarized(J))
5615           Worklist.push_back(J);
5616         else if (needsExtract(J, VF)) {
5617           ScalarCost += TTI.getScalarizationOverhead(
5618               cast<VectorType>(toVectorTy(J->getType(), VF)),
5619               APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false,
5620               /*Extract*/ true, CostKind);
5621         }
5622       }
5623 
5624     // Scale the total scalar cost by block probability.
5625     ScalarCost /= getReciprocalPredBlockProb();
5626 
5627     // Compute the discount. A non-negative discount means the vector version
5628     // of the instruction costs more, and scalarizing would be beneficial.
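         // E.g. (hypothetical costs): VectorCost = 10 and a probability-scaled
         // ScalarCost = 6 yield a discount of 4 in favor of scalarization.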
5629     Discount += VectorCost - ScalarCost;
5630     ScalarCosts[I] = ScalarCost;
5631   }
5632 
5633   return Discount;
5634 }
5635 
5636 InstructionCost LoopVectorizationCostModel::expectedCost(ElementCount VF) {
5637   InstructionCost Cost;
5638 
5639   // If the vector loop gets executed exactly once with the given VF, ignore the
5640   // costs of comparison and induction instructions, as they'll get simplified
5641   // away.
5642   SmallPtrSet<Instruction *, 2> ValuesToIgnoreForVF;
5643   auto TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5644   if (VF.isFixed() && TC == VF.getFixedValue() && !foldTailByMasking())
5645     addFullyUnrolledInstructionsToIgnore(TheLoop, Legal->getInductionVars(),
5646                                          ValuesToIgnoreForVF);
5647 
5648   // For each block.
5649   for (BasicBlock *BB : TheLoop->blocks()) {
5650     InstructionCost BlockCost;
5651 
5652     // For each instruction in the old loop.
5653     for (Instruction &I : BB->instructionsWithoutDebug()) {
5654       // Skip ignored values.
5655       if (ValuesToIgnore.count(&I) || ValuesToIgnoreForVF.count(&I) ||
5656           (VF.isVector() && VecValuesToIgnore.count(&I)))
5657         continue;
5658 
5659       InstructionCost C = getInstructionCost(&I, VF);
5660 
5661       // Check if we should override the cost.
5662       if (C.isValid() && ForceTargetInstructionCost.getNumOccurrences() > 0)
5663         C = InstructionCost(ForceTargetInstructionCost);
5664 
5665       BlockCost += C;
5666       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF "
5667                         << VF << " For instruction: " << I << '\n');
5668     }
5669 
5670     // If we are vectorizing a predicated block, it will have been
5671     // if-converted. This means that the block's instructions (aside from
5672     // stores and instructions that may divide by zero) will now be
5673     // unconditionally executed. For the scalar case, we may not always execute
5674     // the predicated block, if it is an if-else block. Thus, scale the block's
5675     // cost by the probability of executing it. blockNeedsPredication from
5676     // Legal is used so as to not include all blocks in tail folded loops.
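         // E.g., assuming the usual reciprocal block probability of 2, a predicated
         // block is treated as executing on roughly half of the iterations, so its
         // scalar cost is halved here (illustrative of the scaling only).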
5677     if (VF.isScalar() && Legal->blockNeedsPredication(BB))
5678       BlockCost /= getReciprocalPredBlockProb();
5679 
5680     Cost += BlockCost;
5681   }
5682 
5683   return Cost;
5684 }
5685 
5686 /// Gets the address access SCEV after verifying that the access pattern is
5687 /// loop invariant except for the induction variable dependence.
5688 ///
5689 /// This SCEV can be sent to the Target in order to estimate the address
5690 /// calculation cost.
5691 static const SCEV *getAddressAccessSCEV(
5692               Value *Ptr,
5693               LoopVectorizationLegality *Legal,
5694               PredicatedScalarEvolution &PSE,
5695               const Loop *TheLoop) {
5696 
5697   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
5698   if (!Gep)
5699     return nullptr;
5700 
5701   // We are looking for a gep with all loop invariant indices except for one
5702   // which should be an induction variable.
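       // E.g. (illustrative IR): getelementptr inbounds i32, ptr %invariant.base,
       // i64 %iv qualifies when %iv is an induction variable; an index that varies
       // in the loop without being an induction does not.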
5703   auto *SE = PSE.getSE();
5704   unsigned NumOperands = Gep->getNumOperands();
5705   for (unsigned Idx = 1; Idx < NumOperands; ++Idx) {
5706     Value *Opd = Gep->getOperand(Idx);
5707     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
5708         !Legal->isInductionVariable(Opd))
5709       return nullptr;
5710   }
5711 
5712   // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
5713   return PSE.getSCEV(Ptr);
5714 }
5715 
5716 InstructionCost
5717 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
5718                                                         ElementCount VF) {
5719   assert(VF.isVector() &&
5720          "Scalarization cost of instruction implies vectorization.");
5721   if (VF.isScalable())
5722     return InstructionCost::getInvalid();
5723 
5724   Type *ValTy = getLoadStoreType(I);
5725   auto *SE = PSE.getSE();
5726 
5727   unsigned AS = getLoadStoreAddressSpace(I);
5728   Value *Ptr = getLoadStorePointerOperand(I);
5729   Type *PtrTy = toVectorTy(Ptr->getType(), VF);
5730   // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
5731   //       that it is being called from this specific place.
5732 
5733   // Figure out whether the access is strided and get the stride value
5734   // if it's known at compile time.
5735   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
5736 
5737   // Get the cost of the scalar memory instruction and address computation.
5738   InstructionCost Cost =
5739       VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
5740 
5741   // Don't pass *I here, since it is scalar but will actually be part of a
5742   // vectorized loop where the user of it is a vectorized instruction.
5743   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
5744   const Align Alignment = getLoadStoreAlignment(I);
5745   Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(),
5746                                                       ValTy->getScalarType(),
5747                                                       Alignment, AS, CostKind);
5748 
5749   // Get the overhead of the extractelement and insertelement instructions
5750   // we might create due to scalarization.
5751   Cost += getScalarizationOverhead(I, VF, CostKind);
5752 
5753   // If we have a predicated load/store, it will need extra i1 extracts and
5754   // conditional branches, but may not be executed for each vector lane. Scale
5755   // the cost by the probability of executing the predicated block.
5756   if (isPredicatedInst(I)) {
5757     Cost /= getReciprocalPredBlockProb();
5758 
5759     // Add the cost of an i1 extract and a branch
5760     auto *VecI1Ty =
5761         VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
5762     Cost += TTI.getScalarizationOverhead(
5763         VecI1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
5764         /*Insert=*/false, /*Extract=*/true, CostKind);
5765     Cost += TTI.getCFInstrCost(Instruction::Br, CostKind);
5766 
5767     if (useEmulatedMaskMemRefHack(I, VF))
5768       // Artificially setting to a high enough value to practically disable
5769       // vectorization with such operations.
5770       Cost = 3000000;
5771   }
5772 
5773   return Cost;
5774 }
5775 
5776 InstructionCost
5777 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
5778                                                     ElementCount VF) {
5779   Type *ValTy = getLoadStoreType(I);
5780   auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5781   Value *Ptr = getLoadStorePointerOperand(I);
5782   unsigned AS = getLoadStoreAddressSpace(I);
5783   int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
5784   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
5785 
5786   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5787          "Stride should be 1 or -1 for consecutive memory access");
5788   const Align Alignment = getLoadStoreAlignment(I);
5789   InstructionCost Cost = 0;
5790   if (Legal->isMaskRequired(I)) {
5791     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
5792                                       CostKind);
5793   } else {
5794     TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
5795     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
5796                                 CostKind, OpInfo, I);
5797   }
5798 
5799   bool Reverse = ConsecutiveStride < 0;
5800   if (Reverse)
5801     Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, {},
5802                                CostKind, 0);
5803   return Cost;
5804 }
5805 
5806 InstructionCost
5807 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
5808                                                 ElementCount VF) {
5809   assert(Legal->isUniformMemOp(*I, VF));
5810 
5811   Type *ValTy = getLoadStoreType(I);
5812   auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5813   const Align Alignment = getLoadStoreAlignment(I);
5814   unsigned AS = getLoadStoreAddressSpace(I);
5815   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
5816   if (isa<LoadInst>(I)) {
5817     return TTI.getAddressComputationCost(ValTy) +
5818            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
5819                                CostKind) +
5820            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
5821   }
5822   StoreInst *SI = cast<StoreInst>(I);
5823 
5824   bool IsLoopInvariantStoreValue = Legal->isInvariant(SI->getValueOperand());
5825   return TTI.getAddressComputationCost(ValTy) +
5826          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
5827                              CostKind) +
5828          (IsLoopInvariantStoreValue
5829               ? 0
5830               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
5831                                        CostKind, VF.getKnownMinValue() - 1));
5832 }
5833 
5834 InstructionCost
5835 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
5836                                                  ElementCount VF) {
5837   Type *ValTy = getLoadStoreType(I);
5838   auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5839   const Align Alignment = getLoadStoreAlignment(I);
5840   const Value *Ptr = getLoadStorePointerOperand(I);
5841 
5842   return TTI.getAddressComputationCost(VectorTy) +
5843          TTI.getGatherScatterOpCost(
5844              I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
5845              TargetTransformInfo::TCK_RecipThroughput, I);
5846 }
5847 
5848 InstructionCost
5849 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
5850                                                    ElementCount VF) {
5851   const auto *Group = getInterleavedAccessGroup(I);
5852   assert(Group && "Fail to get an interleaved access group.");
5853 
5854   Instruction *InsertPos = Group->getInsertPos();
5855   Type *ValTy = getLoadStoreType(InsertPos);
5856   auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5857   unsigned AS = getLoadStoreAddressSpace(InsertPos);
5858   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
5859 
5860   unsigned InterleaveFactor = Group->getFactor();
5861   auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
5862 
5863   // Holds the indices of existing members in the interleaved group.
5864   SmallVector<unsigned, 4> Indices;
5865   for (unsigned IF = 0; IF < InterleaveFactor; IF++)
5866     if (Group->getMember(IF))
5867       Indices.push_back(IF);
5868 
5869   // Calculate the cost of the whole interleaved group.
5870   bool UseMaskForGaps =
5871       (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
5872       (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
5873   InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
5874       InsertPos->getOpcode(), WideVecTy, Group->getFactor(), Indices,
5875       Group->getAlign(), AS, CostKind, Legal->isMaskRequired(I),
5876       UseMaskForGaps);
5877 
5878   if (Group->isReverse()) {
5879     // TODO: Add support for reversed masked interleaved access.
5880     assert(!Legal->isMaskRequired(I) &&
5881            "Reverse masked interleaved access not supported.");
5882     Cost += Group->getNumMembers() *
5883             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, {},
5884                                CostKind, 0);
5885   }
5886   return Cost;
5887 }
5888 
5889 std::optional<InstructionCost>
5890 LoopVectorizationCostModel::getReductionPatternCost(
5891     Instruction *I, ElementCount VF, Type *Ty,
5892     TTI::TargetCostKind CostKind) const {
5893   using namespace llvm::PatternMatch;
5894   // Early exit for no inloop reductions
5895   if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Ty))
5896     return std::nullopt;
5897   auto *VectorTy = cast<VectorType>(Ty);
5898 
5899   // We are looking for one of the following patterns, and for the minimal
5900   // acceptable cost of that pattern:
5901   //  reduce(mul(ext(A), ext(B))) or
5902   //  reduce(mul(A, B)) or
5903   //  reduce(ext(A)) or
5904   //  reduce(A).
5905   // The basic idea is that we walk down the tree, finding the root reduction
5906   // instruction in InLoopReductionImmediateChains. From there we find the
5907   // pattern of mul/ext and test the cost of the entire pattern vs the cost of
5908   // the components. If the reduction cost is lower, we return it for the
5909   // reduction instruction and 0 for the other instructions in the pattern. If
5910   // it is not, we return an invalid cost so that the original cost method is used.
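       // Illustrative case (hypothetical types): i16 inputs multiplied, extended
       // and accumulated into an i32 sum, i.e. reduce.add(ext(mul(ext(A), ext(B)))),
       // can often map to a single multiply-accumulate reduction, which is why the
       // combined pattern cost is compared against the per-instruction costs below.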
5911   Instruction *RetI = I;
5912   if (match(RetI, m_ZExtOrSExt(m_Value()))) {
5913     if (!RetI->hasOneUser())
5914       return std::nullopt;
5915     RetI = RetI->user_back();
5916   }
5917 
5918   if (match(RetI, m_OneUse(m_Mul(m_Value(), m_Value()))) &&
5919       RetI->user_back()->getOpcode() == Instruction::Add) {
5920     RetI = RetI->user_back();
5921   }
5922 
5923   // Test if the found instruction is a reduction, and if not return an invalid
5924   // cost specifying the parent to use the original cost modelling.
5925   if (!InLoopReductionImmediateChains.count(RetI))
5926     return std::nullopt;
5927 
5928   // Find the reduction this chain is a part of and calculate the basic cost of
5929   // the reduction on its own.
5930   Instruction *LastChain = InLoopReductionImmediateChains.at(RetI);
5931   Instruction *ReductionPhi = LastChain;
5932   while (!isa<PHINode>(ReductionPhi))
5933     ReductionPhi = InLoopReductionImmediateChains.at(ReductionPhi);
5934 
5935   const RecurrenceDescriptor &RdxDesc =
5936       Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second;
5937 
5938   InstructionCost BaseCost;
5939   RecurKind RK = RdxDesc.getRecurrenceKind();
5940   if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) {
5941     Intrinsic::ID MinMaxID = getMinMaxReductionIntrinsicOp(RK);
5942     BaseCost = TTI.getMinMaxReductionCost(MinMaxID, VectorTy,
5943                                           RdxDesc.getFastMathFlags(), CostKind);
5944   } else {
5945     BaseCost = TTI.getArithmeticReductionCost(
5946         RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
5947   }
5948 
5949   // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
5950   // normal fmul instruction to the cost of the fadd reduction.
5951   if (RK == RecurKind::FMulAdd)
5952     BaseCost +=
5953         TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);
5954 
5955   // If we're using ordered reductions then we can just return the base cost
5956   // here, since getArithmeticReductionCost calculates the full ordered
5957   // reduction cost when FP reassociation is not allowed.
5958   if (useOrderedReductions(RdxDesc))
5959     return BaseCost;
5960 
5961   // Get the operand that was not the reduction chain and match it to one of the
5962   // patterns, returning the better cost if it is found.
5963   Instruction *RedOp = RetI->getOperand(1) == LastChain
5964                            ? dyn_cast<Instruction>(RetI->getOperand(0))
5965                            : dyn_cast<Instruction>(RetI->getOperand(1));
5966 
5967   VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
5968 
5969   Instruction *Op0, *Op1;
5970   if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
5971       match(RedOp,
5972             m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
5973       match(Op0, m_ZExtOrSExt(m_Value())) &&
5974       Op0->getOpcode() == Op1->getOpcode() &&
5975       Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
5976       !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) &&
5977       (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
5978 
5979     // Matched reduce.add(ext(mul(ext(A), ext(B))))
5980     // Note that the extend opcodes need to all match, or if A==B they will have
5981     // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
5982     // which is equally fine.
5983     bool IsUnsigned = isa<ZExtInst>(Op0);
5984     auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
5985     auto *MulType = VectorType::get(Op0->getType(), VectorTy);
5986 
5987     InstructionCost ExtCost =
5988         TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
5989                              TTI::CastContextHint::None, CostKind, Op0);
5990     InstructionCost MulCost =
5991         TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
5992     InstructionCost Ext2Cost =
5993         TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType,
5994                              TTI::CastContextHint::None, CostKind, RedOp);
5995 
5996     InstructionCost RedCost = TTI.getMulAccReductionCost(
5997         IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
5998 
5999     if (RedCost.isValid() &&
6000         RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
6001       return I == RetI ? RedCost : 0;
6002   } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
6003              !TheLoop->isLoopInvariant(RedOp)) {
6004     // Matched reduce(ext(A))
6005     bool IsUnsigned = isa<ZExtInst>(RedOp);
6006     auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
6007     InstructionCost RedCost = TTI.getExtendedReductionCost(
6008         RdxDesc.getOpcode(), IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
6009         RdxDesc.getFastMathFlags(), CostKind);
6010 
6011     InstructionCost ExtCost =
6012         TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
6013                              TTI::CastContextHint::None, CostKind, RedOp);
6014     if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
6015       return I == RetI ? RedCost : 0;
6016   } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
6017              match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
6018     if (match(Op0, m_ZExtOrSExt(m_Value())) &&
6019         Op0->getOpcode() == Op1->getOpcode() &&
6020         !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) {
6021       bool IsUnsigned = isa<ZExtInst>(Op0);
6022       Type *Op0Ty = Op0->getOperand(0)->getType();
6023       Type *Op1Ty = Op1->getOperand(0)->getType();
6024       Type *LargestOpTy =
6025           Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
6026                                                                     : Op0Ty;
6027       auto *ExtType = VectorType::get(LargestOpTy, VectorTy);
6028 
6029       // Matched reduce.add(mul(ext(A), ext(B))), where the two exts may have
6030       // different source sizes. We cost the reduction using the largest extend
6031       // type, adding the remaining cost as e.g. reduce(mul(ext(ext(A)), ext(B))).
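      // For instance (illustrative only): with A as i8 and B as i16, both
      // multiplied at i32, LargestOpTy is i16. The i16->i32 extends and the
      // multiply are folded into the mul-acc reduction cost, and the extra
      // i8->i16 extend of A is accounted for separately as ExtraExtCost.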
6032       InstructionCost ExtCost0 = TTI.getCastInstrCost(
6033           Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy),
6034           TTI::CastContextHint::None, CostKind, Op0);
6035       InstructionCost ExtCost1 = TTI.getCastInstrCost(
6036           Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy),
6037           TTI::CastContextHint::None, CostKind, Op1);
6038       InstructionCost MulCost =
6039           TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6040 
6041       InstructionCost RedCost = TTI.getMulAccReductionCost(
6042           IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
6043       InstructionCost ExtraExtCost = 0;
6044       if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
6045         Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
6046         ExtraExtCost = TTI.getCastInstrCost(
6047             ExtraExtOp->getOpcode(), ExtType,
6048             VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy),
6049             TTI::CastContextHint::None, CostKind, ExtraExtOp);
6050       }
6051 
6052       if (RedCost.isValid() &&
6053           (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
6054         return I == RetI ? RedCost : 0;
6055     } else if (!match(I, m_ZExtOrSExt(m_Value()))) {
6056       // Matched reduce.add(mul())
6057       InstructionCost MulCost =
6058           TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6059 
6060       InstructionCost RedCost = TTI.getMulAccReductionCost(
6061           true, RdxDesc.getRecurrenceType(), VectorTy, CostKind);
6062 
6063       if (RedCost.isValid() && RedCost < MulCost + BaseCost)
6064         return I == RetI ? RedCost : 0;
6065     }
6066   }
6067 
6068   return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt;
6069 }
6070 
6071 InstructionCost
6072 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
6073                                                      ElementCount VF) {
6074   // Calculate scalar cost only. Vectorization cost should be ready at this
6075   // moment.
6076   if (VF.isScalar()) {
6077     Type *ValTy = getLoadStoreType(I);
6078     const Align Alignment = getLoadStoreAlignment(I);
6079     unsigned AS = getLoadStoreAddressSpace(I);
6080 
6081     TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
6082     return TTI.getAddressComputationCost(ValTy) +
6083            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
6084                                TTI::TCK_RecipThroughput, OpInfo, I);
6085   }
6086   return getWideningCost(I, VF);
6087 }
6088 
6089 InstructionCost LoopVectorizationCostModel::getScalarizationOverhead(
6090     Instruction *I, ElementCount VF, TTI::TargetCostKind CostKind) const {
6091 
6092   // There is no mechanism yet to create a scalable scalarization loop,
6093   // so this is currently Invalid.
6094   if (VF.isScalable())
6095     return InstructionCost::getInvalid();
6096 
6097   if (VF.isScalar())
6098     return 0;
6099 
6100   InstructionCost Cost = 0;
6101   Type *RetTy = toVectorTy(I->getType(), VF);
6102   if (!RetTy->isVoidTy() &&
6103       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
6104     Cost += TTI.getScalarizationOverhead(
6105         cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()),
6106         /*Insert*/ true,
6107         /*Extract*/ false, CostKind);
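  // Rough intuition (illustrative): at VF=4, a scalarized instruction with a
  // vector result needs roughly four insertelements to rebuild that result;
  // the extracts needed to feed each scalar operand are added below.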
6108 
6109   // Some targets keep addresses scalar.
6110   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
6111     return Cost;
6112 
6113   // Some targets support efficient element stores.
6114   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
6115     return Cost;
6116 
6117   // Collect operands to consider.
6118   CallInst *CI = dyn_cast<CallInst>(I);
6119   Instruction::op_range Ops = CI ? CI->args() : I->operands();
6120 
6121   // Skip operands that do not require extraction/scalarization and do not incur
6122   // any overhead.
6123   SmallVector<Type *> Tys;
6124   for (auto *V : filterExtractingOperands(Ops, VF))
6125     Tys.push_back(maybeVectorizeType(V->getType(), VF));
6126   return Cost + TTI.getOperandsScalarizationOverhead(
6127                     filterExtractingOperands(Ops, VF), Tys, CostKind);
6128 }
6129 
6130 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
6131   if (VF.isScalar())
6132     return;
6133   NumPredStores = 0;
6134   for (BasicBlock *BB : TheLoop->blocks()) {
6135     // For each instruction in the old loop.
6136     for (Instruction &I : *BB) {
6137       Value *Ptr = getLoadStorePointerOperand(&I);
6138       if (!Ptr)
6139         continue;
6140 
6141       // TODO: We should generate better code and update the cost model for
6142       // predicated uniform stores. Today they are treated as any other
6143       // predicated store (see added test cases in
6144       // invariant-store-vectorization.ll).
6145       if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF))
6146         NumPredStores++;
6147 
6148       if (Legal->isUniformMemOp(I, VF)) {
6149         auto IsLegalToScalarize = [&]() {
6150           if (!VF.isScalable())
6151             // Scalarization of fixed length vectors "just works".
6152             return true;
6153 
6154           // We have dedicated lowering for unpredicated uniform loads and
6155           // stores.  Note that even with tail folding we know that at least
6156           // one lane is active (i.e. generalized predication is not possible
6157           // here), and the logic below depends on this fact.
6158           if (!foldTailByMasking())
6159             return true;
6160 
6161           // For scalable vectors, a uniform memop load is always
6162           // uniform-by-parts and we know how to scalarize that.
6163           if (isa<LoadInst>(I))
6164             return true;
6165 
6166           // A uniform store isn't necessarily uniform-by-parts
6167           // and we can't assume scalarization.
6168           auto &SI = cast<StoreInst>(I);
6169           return TheLoop->isLoopInvariant(SI.getValueOperand());
6170         };
6171 
6172         const InstructionCost GatherScatterCost =
6173           isLegalGatherOrScatter(&I, VF) ?
6174           getGatherScatterCost(&I, VF) : InstructionCost::getInvalid();
6175 
6176         // Load: Scalar load + broadcast
6177         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6178         // FIXME: This cost is a significant under-estimate for tail folded
6179         // memory ops.
6180         const InstructionCost ScalarizationCost =
6181             IsLegalToScalarize() ? getUniformMemOpCost(&I, VF)
6182                                  : InstructionCost::getInvalid();
6183 
6184         // Choose the better solution for the current VF. Note that Invalid
6185         // costs compare as maximally large. If both are invalid, the result is
6186         // an invalid (scalable) cost, signalling failure and a vectorization abort.
6187         if (GatherScatterCost < ScalarizationCost)
6188           setWideningDecision(&I, VF, CM_GatherScatter, GatherScatterCost);
6189         else
6190           setWideningDecision(&I, VF, CM_Scalarize, ScalarizationCost);
6191         continue;
6192       }
6193 
6194       // We assume that widening is the best solution when possible.
6195       if (memoryInstructionCanBeWidened(&I, VF)) {
6196         InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
6197         int ConsecutiveStride = Legal->isConsecutivePtr(
6198             getLoadStoreType(&I), getLoadStorePointerOperand(&I));
6199         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6200                "Expected consecutive stride.");
6201         InstWidening Decision =
6202             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6203         setWideningDecision(&I, VF, Decision, Cost);
6204         continue;
6205       }
6206 
6207       // Choose between Interleaving, Gather/Scatter or Scalarization.
6208       InstructionCost InterleaveCost = InstructionCost::getInvalid();
6209       unsigned NumAccesses = 1;
6210       if (isAccessInterleaved(&I)) {
6211         const auto *Group = getInterleavedAccessGroup(&I);
6212         assert(Group && "Fail to get an interleaved access group.");
6213 
6214         // Make one decision for the whole group.
6215         if (getWideningDecision(&I, VF) != CM_Unknown)
6216           continue;
6217 
6218         NumAccesses = Group->getNumMembers();
6219         if (interleavedAccessCanBeWidened(&I, VF))
6220           InterleaveCost = getInterleaveGroupCost(&I, VF);
6221       }
6222 
6223       InstructionCost GatherScatterCost =
6224           isLegalGatherOrScatter(&I, VF)
6225               ? getGatherScatterCost(&I, VF) * NumAccesses
6226               : InstructionCost::getInvalid();
6227 
6228       InstructionCost ScalarizationCost =
6229           getMemInstScalarizationCost(&I, VF) * NumAccesses;
6230 
6231       // Choose the best solution for the current VF, record the decision,
6232       // and use it later during vectorization.
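      // Note on tie-breaking (an observation from the comparisons below, not a
      // documented contract): on equal cost, interleaving is preferred over
      // gather/scatter, while scalarization is preferred over both.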
6233       InstructionCost Cost;
6234       InstWidening Decision;
6235       if (InterleaveCost <= GatherScatterCost &&
6236           InterleaveCost < ScalarizationCost) {
6237         Decision = CM_Interleave;
6238         Cost = InterleaveCost;
6239       } else if (GatherScatterCost < ScalarizationCost) {
6240         Decision = CM_GatherScatter;
6241         Cost = GatherScatterCost;
6242       } else {
6243         Decision = CM_Scalarize;
6244         Cost = ScalarizationCost;
6245       }
6246       // If the instruction belongs to an interleave group, the whole group
6247       // receives the same decision. The cost is recorded for the group as a
6248       // whole, but will actually be assigned to a single member instruction.
6249       if (const auto *Group = getInterleavedAccessGroup(&I))
6250         setWideningDecision(Group, VF, Decision, Cost);
6251       else
6252         setWideningDecision(&I, VF, Decision, Cost);
6253     }
6254   }
6255 
6256   // Make sure that any load of an address and any other address computation
6257   // remains scalar unless there is gather/scatter support. This avoids
6258   // inevitable extracts into address registers, and also has the benefit of
6259   // activating LSR more, since that pass can't optimize vectorized
6260   // addresses.
6261   if (TTI.prefersVectorizedAddressing())
6262     return;
6263 
6264   // Start with all scalar pointer uses.
6265   SmallPtrSet<Instruction *, 8> AddrDefs;
6266   for (BasicBlock *BB : TheLoop->blocks())
6267     for (Instruction &I : *BB) {
6268       Instruction *PtrDef =
6269         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6270       if (PtrDef && TheLoop->contains(PtrDef) &&
6271           getWideningDecision(&I, VF) != CM_GatherScatter)
6272         AddrDefs.insert(PtrDef);
6273     }
6274 
6275   // Add all instructions used to generate the addresses.
6276   SmallVector<Instruction *, 4> Worklist;
6277   append_range(Worklist, AddrDefs);
6278   while (!Worklist.empty()) {
6279     Instruction *I = Worklist.pop_back_val();
6280     for (auto &Op : I->operands())
6281       if (auto *InstOp = dyn_cast<Instruction>(Op))
6282         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6283             AddrDefs.insert(InstOp).second)
6284           Worklist.push_back(InstOp);
6285   }
6286 
6287   for (auto *I : AddrDefs) {
6288     if (isa<LoadInst>(I)) {
6289       // Setting the desired widening decision should ideally be handled by
6290       // the cost functions, but since that would require finding out whether
6291       // the loaded value is involved in an address computation, the decision
6292       // is instead overridden here once we know this is the case.
6293       InstWidening Decision = getWideningDecision(I, VF);
6294       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6295         // Scalarize a widened load of address.
6296         setWideningDecision(
6297             I, VF, CM_Scalarize,
6298             (VF.getKnownMinValue() *
6299              getMemoryInstructionCost(I, ElementCount::getFixed(1))));
6300       else if (const auto *Group = getInterleavedAccessGroup(I)) {
6301         // Scalarize an interleave group of address loads.
6302         for (unsigned I = 0; I < Group->getFactor(); ++I) {
6303           if (Instruction *Member = Group->getMember(I))
6304             setWideningDecision(
6305                 Member, VF, CM_Scalarize,
6306                 (VF.getKnownMinValue() *
6307                  getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
6308         }
6309       }
6310     } else
6311       // Make sure I gets scalarized and a cost estimate without
6312       // scalarization overhead.
6313       ForcedScalars[VF].insert(I);
6314   }
6315 }
6316 
6317 void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
6318   assert(!VF.isScalar() &&
6319          "Trying to set a vectorization decision for a scalar VF");
6320 
6321   auto ForcedScalar = ForcedScalars.find(VF);
6322   for (BasicBlock *BB : TheLoop->blocks()) {
6323     // For each instruction in the old loop.
6324     for (Instruction &I : *BB) {
6325       CallInst *CI = dyn_cast<CallInst>(&I);
6326 
6327       if (!CI)
6328         continue;
6329 
6330       InstructionCost ScalarCost = InstructionCost::getInvalid();
6331       InstructionCost VectorCost = InstructionCost::getInvalid();
6332       InstructionCost IntrinsicCost = InstructionCost::getInvalid();
6333       TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6334       Function *ScalarFunc = CI->getCalledFunction();
6335       Type *ScalarRetTy = CI->getType();
6336       SmallVector<Type *, 4> Tys, ScalarTys;
6337       for (auto &ArgOp : CI->args())
6338         ScalarTys.push_back(ArgOp->getType());
6339 
6340       // Estimate cost of scalarized vector call. The source operands are
6341       // assumed to be vectors, so we need to extract individual elements from
6342       // them, execute VF scalar calls, and then gather the results into the
6343       // vector return value.
6344       InstructionCost ScalarCallCost =
6345           TTI.getCallInstrCost(ScalarFunc, ScalarRetTy, ScalarTys, CostKind);
6346 
6347       // Compute costs of unpacking argument values for the scalar calls and
6348       // packing the return values to a vector.
6349       InstructionCost ScalarizationCost =
6350           getScalarizationOverhead(CI, VF, CostKind);
6351 
6352       ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
6353       // Honor ForcedScalars and UniformAfterVectorization decisions.
6354       // TODO: For calls, it might still be more profitable to widen. Use
6355       // VPlan-based cost model to compare different options.
6356       if (VF.isVector() && ((ForcedScalar != ForcedScalars.end() &&
6357                              ForcedScalar->second.contains(CI)) ||
6358                             isUniformAfterVectorization(CI, VF))) {
6359         setCallWideningDecision(CI, VF, CM_Scalarize, nullptr,
6360                                 Intrinsic::not_intrinsic, std::nullopt,
6361                                 ScalarCost);
6362         continue;
6363       }
6364 
6365       bool MaskRequired = Legal->isMaskRequired(CI);
6366       // Compute corresponding vector type for return value and arguments.
6367       Type *RetTy = toVectorTy(ScalarRetTy, VF);
6368       for (Type *ScalarTy : ScalarTys)
6369         Tys.push_back(toVectorTy(ScalarTy, VF));
6370 
6371       // An in-loop reduction using an fmuladd intrinsic is a special case;
6372       // we don't want the normal cost for that intrinsic.
6373       if (RecurrenceDescriptor::isFMulAddIntrinsic(CI))
6374         if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind)) {
6375           setCallWideningDecision(CI, VF, CM_IntrinsicCall, nullptr,
6376                                   getVectorIntrinsicIDForCall(CI, TLI),
6377                                   std::nullopt, *RedCost);
6378           continue;
6379         }
6380 
6381       // Find the cost of vectorizing the call, if we can find a suitable
6382       // vector variant of the function.
6383       bool UsesMask = false;
6384       VFInfo FuncInfo;
6385       Function *VecFunc = nullptr;
6386       // Search through any available variants for one we can use at this VF.
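      // For illustration (assumed example, not taken from this file): a
      // function declared with `#pragma omp declare simd simdlen(4)` typically
      // carries a "vector-function-abi-variant" mapping such as
      // "_ZGVnN4v_foo(foo_vec)", i.e. a 4-lane, unmasked variant taking its
      // parameter as a vector.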
6387       for (VFInfo &Info : VFDatabase::getMappings(*CI)) {
6388         // Must match requested VF.
6389         if (Info.Shape.VF != VF)
6390           continue;
6391 
6392         // Must take a mask argument if one is required
6393         if (MaskRequired && !Info.isMasked())
6394           continue;
6395 
6396         // Check that all parameter kinds are supported
6397         bool ParamsOk = true;
6398         for (VFParameter Param : Info.Shape.Parameters) {
6399           switch (Param.ParamKind) {
6400           case VFParamKind::Vector:
6401             break;
6402           case VFParamKind::OMP_Uniform: {
6403             Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
6404             // Make sure the scalar parameter in the loop is invariant.
6405             if (!PSE.getSE()->isLoopInvariant(PSE.getSCEV(ScalarParam),
6406                                               TheLoop))
6407               ParamsOk = false;
6408             break;
6409           }
6410           case VFParamKind::OMP_Linear: {
6411             Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
6412             // Find the stride for the scalar parameter in this loop and see if
6413             // it matches the stride for the variant.
6414             // TODO: do we need to figure out the cost of an extract to get the
6415             // first lane? Or do we hope that it will be folded away?
6416             ScalarEvolution *SE = PSE.getSE();
6417             const auto *SAR =
6418                 dyn_cast<SCEVAddRecExpr>(SE->getSCEV(ScalarParam));
6419 
6420             if (!SAR || SAR->getLoop() != TheLoop) {
6421               ParamsOk = false;
6422               break;
6423             }
6424 
6425             const SCEVConstant *Step =
6426                 dyn_cast<SCEVConstant>(SAR->getStepRecurrence(*SE));
6427 
6428             if (!Step ||
6429                 Step->getAPInt().getSExtValue() != Param.LinearStepOrPos)
6430               ParamsOk = false;
6431 
6432             break;
6433           }
6434           case VFParamKind::GlobalPredicate:
6435             UsesMask = true;
6436             break;
6437           default:
6438             ParamsOk = false;
6439             break;
6440           }
6441         }
6442 
6443         if (!ParamsOk)
6444           continue;
6445 
6446         // Found a suitable candidate, stop here.
6447         VecFunc = CI->getModule()->getFunction(Info.VectorName);
6448         FuncInfo = Info;
6449         break;
6450       }
6451 
6452       // Add in the cost of synthesizing a mask if one wasn't required.
6453       InstructionCost MaskCost = 0;
6454       if (VecFunc && UsesMask && !MaskRequired)
6455         MaskCost = TTI.getShuffleCost(
6456             TargetTransformInfo::SK_Broadcast,
6457             VectorType::get(IntegerType::getInt1Ty(
6458                                 VecFunc->getFunctionType()->getContext()),
6459                             VF));
6460 
6461       if (TLI && VecFunc && !CI->isNoBuiltin())
6462         VectorCost =
6463             TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind) + MaskCost;
6464 
6465       // Find the cost of an intrinsic; some targets may have instructions that
6466       // perform the operation without needing an actual call.
6467       Intrinsic::ID IID = getVectorIntrinsicIDForCall(CI, TLI);
6468       if (IID != Intrinsic::not_intrinsic)
6469         IntrinsicCost = getVectorIntrinsicCost(CI, VF);
6470 
6471       InstructionCost Cost = ScalarCost;
6472       InstWidening Decision = CM_Scalarize;
6473 
6474       if (VectorCost <= Cost) {
6475         Cost = VectorCost;
6476         Decision = CM_VectorCall;
6477       }
6478 
6479       if (IntrinsicCost <= Cost) {
6480         Cost = IntrinsicCost;
6481         Decision = CM_IntrinsicCall;
6482       }
6483 
6484       setCallWideningDecision(CI, VF, Decision, VecFunc, IID,
6485                               FuncInfo.getParamIndexForOptionalMask(), Cost);
6486     }
6487   }
6488 }
6489 
6490 bool LoopVectorizationCostModel::shouldConsiderInvariant(Value *Op) {
6491   if (!Legal->isInvariant(Op))
6492     return false;
6493   // Consider Op invariant only if neither it nor its operands are predicated
6494   // instructions in the loop; a predicated instruction is not trivially hoistable.
6495   auto *OpI = dyn_cast<Instruction>(Op);
6496   return !OpI || !TheLoop->contains(OpI) ||
6497          (!isPredicatedInst(OpI) &&
6498           (!isa<PHINode>(OpI) || OpI->getParent() != TheLoop->getHeader()) &&
6499           all_of(OpI->operands(),
6500                  [this](Value *Op) { return shouldConsiderInvariant(Op); }));
6501 }
6502 
6503 InstructionCost
6504 LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6505                                                ElementCount VF) {
6506   // If we know that this instruction will remain uniform, check the cost of
6507   // the scalar version.
6508   if (isUniformAfterVectorization(I, VF))
6509     VF = ElementCount::getFixed(1);
6510 
6511   if (VF.isVector() && isProfitableToScalarize(I, VF))
6512     return InstsToScalarize[VF][I];
6513 
6514   // Forced scalars do not have any scalarization overhead.
6515   auto ForcedScalar = ForcedScalars.find(VF);
6516   if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
6517     auto InstSet = ForcedScalar->second;
6518     if (InstSet.count(I))
6519       return getInstructionCost(I, ElementCount::getFixed(1)) *
6520              VF.getKnownMinValue();
6521   }
6522 
6523   Type *RetTy = I->getType();
6524   if (canTruncateToMinimalBitwidth(I, VF))
6525     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6526   auto *SE = PSE.getSE();
6527   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6528 
6529   auto HasSingleCopyAfterVectorization = [this](Instruction *I,
6530                                                 ElementCount VF) -> bool {
6531     if (VF.isScalar())
6532       return true;
6533 
6534     auto Scalarized = InstsToScalarize.find(VF);
6535     assert(Scalarized != InstsToScalarize.end() &&
6536            "VF not yet analyzed for scalarization profitability");
6537     return !Scalarized->second.count(I) &&
6538            llvm::all_of(I->users(), [&](User *U) {
6539              auto *UI = cast<Instruction>(U);
6540              return !Scalarized->second.count(UI);
6541            });
6542   };
6543   (void)HasSingleCopyAfterVectorization;
6544 
6545   Type *VectorTy;
6546   if (isScalarAfterVectorization(I, VF)) {
6547     // With the exception of GEPs and PHIs, after scalarization there should
6548     // only be one copy of the instruction generated in the loop. This is
6549     // because the VF is either 1, or any instructions that need scalarizing
6550     // have already been dealt with by the time we get here. As a result,
6551     // we don't have to multiply the instruction cost by VF.
6552     assert(I->getOpcode() == Instruction::GetElementPtr ||
6553            I->getOpcode() == Instruction::PHI ||
6554            (I->getOpcode() == Instruction::BitCast &&
6555             I->getType()->isPointerTy()) ||
6556            HasSingleCopyAfterVectorization(I, VF));
6557     VectorTy = RetTy;
6558   } else
6559     VectorTy = toVectorTy(RetTy, VF);
6560 
6561   if (VF.isVector() && VectorTy->isVectorTy() &&
6562       !TTI.getNumberOfParts(VectorTy))
6563     return InstructionCost::getInvalid();
6564 
6565   // TODO: We need to estimate the cost of intrinsic calls.
6566   switch (I->getOpcode()) {
6567   case Instruction::GetElementPtr:
6568     // We mark this instruction as zero-cost because the cost of GEPs in
6569     // vectorized code depends on whether the corresponding memory instruction
6570     // is scalarized or not. Therefore, we handle GEPs with the memory
6571     // instruction cost.
6572     return 0;
6573   case Instruction::Br: {
6574     // In cases of scalarized and predicated instructions, there will be VF
6575     // predicated blocks in the vectorized loop. Each branch around these
6576     // blocks also requires an extract of its vector compare i1 element.
6577     // Note that the conditional branch from the loop latch will be replaced by
6578     // a single branch controlling the loop, so there is no extra overhead from
6579     // scalarization.
6580     bool ScalarPredicatedBB = false;
6581     BranchInst *BI = cast<BranchInst>(I);
6582     if (VF.isVector() && BI->isConditional() &&
6583         (PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) ||
6584          PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))) &&
6585         BI->getParent() != TheLoop->getLoopLatch())
6586       ScalarPredicatedBB = true;
6587 
6588     if (ScalarPredicatedBB) {
6589       // Not possible to scalarize a scalable vector with predicated instructions.
6590       if (VF.isScalable())
6591         return InstructionCost::getInvalid();
6592       // Return cost for branches around scalarized and predicated blocks.
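      // (Illustration: at VF=4 this amounts to four i1 extracts plus four
      // scalar branches.)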
6593       auto *VecI1Ty =
6594           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6595       return (
6596           TTI.getScalarizationOverhead(
6597               VecI1Ty, APInt::getAllOnes(VF.getFixedValue()),
6598               /*Insert*/ false, /*Extract*/ true, CostKind) +
6599           (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
6600     }
6601 
6602     if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
6603       // The back-edge branch will remain, as will all scalar branches.
6604       return TTI.getCFInstrCost(Instruction::Br, CostKind);
6605 
6606     // This branch will be eliminated by if-conversion.
6607     return 0;
6608     // Note: We currently assume zero cost for an unconditional branch inside
6609     // a predicated block since it will become a fall-through, although we
6610     // may decide in the future to call TTI for all branches.
6611   }
6612   case Instruction::Switch: {
6613     if (VF.isScalar())
6614       return TTI.getCFInstrCost(Instruction::Switch, CostKind);
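    // For a vectorized switch the condition is compared against each case
    // value, so we approximate the cost as one vector compare per case.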
6615     auto *Switch = cast<SwitchInst>(I);
6616     return Switch->getNumCases() *
6617            TTI.getCmpSelInstrCost(
6618                Instruction::ICmp,
6619                toVectorTy(Switch->getCondition()->getType(), VF),
6620                toVectorTy(Type::getInt1Ty(I->getContext()), VF),
6621                CmpInst::ICMP_EQ, CostKind);
6622   }
6623   case Instruction::PHI: {
6624     auto *Phi = cast<PHINode>(I);
6625 
6626     // First-order recurrences are replaced by vector shuffles inside the loop.
6627     if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) {
6628       // For <vscale x 1 x i64>, if vscale = 1 we are unable to extract the
6629       // penultimate value of the recurrence.
6630       // TODO: Consider vscale_range info.
6631       if (VF.isScalable() && VF.getKnownMinValue() == 1)
6632         return InstructionCost::getInvalid();
6633       SmallVector<int> Mask(VF.getKnownMinValue());
6634       std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
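      // E.g. for VF=4 this builds the splice mask <3,4,5,6>: the last lane of
      // the vector from the previous iteration followed by the first VF-1
      // lanes of the current one.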
6635       return TTI.getShuffleCost(TargetTransformInfo::SK_Splice,
6636                                 cast<VectorType>(VectorTy), Mask, CostKind,
6637                                 VF.getKnownMinValue() - 1);
6638     }
6639 
6640     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6641     // converted into select instructions. We require N - 1 selects per phi
6642     // node, where N is the number of incoming values.
6643     if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) {
6644       Type *ResultTy = Phi->getType();
6645 
6646       // All instructions in an Any-of reduction chain are narrowed to bool.
6647       // Check if that is the case for this phi node.
6648       auto *HeaderUser = cast_if_present<PHINode>(
6649           find_singleton<User>(Phi->users(), [this](User *U, bool) -> User * {
6650             auto *Phi = dyn_cast<PHINode>(U);
6651             if (Phi && Phi->getParent() == TheLoop->getHeader())
6652               return Phi;
6653             return nullptr;
6654           }));
6655       if (HeaderUser) {
6656         auto &ReductionVars = Legal->getReductionVars();
6657         auto Iter = ReductionVars.find(HeaderUser);
6658         if (Iter != ReductionVars.end() &&
6659             RecurrenceDescriptor::isAnyOfRecurrenceKind(
6660                 Iter->second.getRecurrenceKind()))
6661           ResultTy = Type::getInt1Ty(Phi->getContext());
6662       }
6663       return (Phi->getNumIncomingValues() - 1) *
6664              TTI.getCmpSelInstrCost(
6665                  Instruction::Select, toVectorTy(ResultTy, VF),
6666                  toVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
6667                  CmpInst::BAD_ICMP_PREDICATE, CostKind);
6668     }
6669 
6670     // When tail folding with EVL, if the phi is part of an out of loop
6671     // reduction then it will be transformed into a wide vp_merge.
6672     if (VF.isVector() && foldTailWithEVL() &&
6673         Legal->getReductionVars().contains(Phi) && !isInLoopReduction(Phi)) {
6674       IntrinsicCostAttributes ICA(
6675           Intrinsic::vp_merge, toVectorTy(Phi->getType(), VF),
6676           {toVectorTy(Type::getInt1Ty(Phi->getContext()), VF)});
6677       return TTI.getIntrinsicInstrCost(ICA, CostKind);
6678     }
6679 
6680     return TTI.getCFInstrCost(Instruction::PHI, CostKind);
6681   }
6682   case Instruction::UDiv:
6683   case Instruction::SDiv:
6684   case Instruction::URem:
6685   case Instruction::SRem:
6686     if (VF.isVector() && isPredicatedInst(I)) {
6687       const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
6688       return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ?
6689         ScalarCost : SafeDivisorCost;
6690     }
6691     // We've proven all lanes safe to speculate, fall through.
6692     [[fallthrough]];
6693   case Instruction::Add:
6694   case Instruction::Sub: {
6695     auto Info = Legal->getHistogramInfo(I);
6696     if (Info && VF.isVector()) {
6697       const HistogramInfo *HGram = Info.value();
6698       // Assume that a non-constant update value (or a constant != 1) requires
6699       // a multiply, and add that into the cost.
6700       InstructionCost MulCost = TTI::TCC_Free;
6701       ConstantInt *RHS = dyn_cast<ConstantInt>(I->getOperand(1));
6702       if (!RHS || RHS->getZExtValue() != 1)
6703         MulCost = TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy);
6704 
6705       // Find the cost of the histogram operation itself.
6706       Type *PtrTy = VectorType::get(HGram->Load->getPointerOperandType(), VF);
6707       Type *ScalarTy = I->getType();
6708       Type *MaskTy = VectorType::get(Type::getInt1Ty(I->getContext()), VF);
6709       IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add,
6710                                   Type::getVoidTy(I->getContext()),
6711                                   {PtrTy, ScalarTy, MaskTy});
6712 
6713       // Add the costs together with the add/sub operation.
6714       return TTI.getIntrinsicInstrCost(
6715                  ICA, TargetTransformInfo::TCK_RecipThroughput) +
6716              MulCost + TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy);
6717     }
6718     [[fallthrough]];
6719   }
6720   case Instruction::FAdd:
6721   case Instruction::FSub:
6722   case Instruction::Mul:
6723   case Instruction::FMul:
6724   case Instruction::FDiv:
6725   case Instruction::FRem:
6726   case Instruction::Shl:
6727   case Instruction::LShr:
6728   case Instruction::AShr:
6729   case Instruction::And:
6730   case Instruction::Or:
6731   case Instruction::Xor: {
6732     // If we're speculating on the stride being 1, the multiplication may
6733     // fold away.  We can generalize this for all operations using the notion
6734     // of neutral elements.  (TODO)
6735     if (I->getOpcode() == Instruction::Mul &&
6736         (PSE.getSCEV(I->getOperand(0))->isOne() ||
6737          PSE.getSCEV(I->getOperand(1))->isOne()))
6738       return 0;
6739 
6740     // Detect reduction patterns
6741     if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
6742       return *RedCost;
6743 
6744     // Certain instructions can be cheaper to vectorize if they have a constant
6745     // second vector operand. One example of this are shifts on x86.
6746     Value *Op2 = I->getOperand(1);
6747     if (!isa<Constant>(Op2) && PSE.getSE()->isSCEVable(Op2->getType()) &&
6748         isa<SCEVConstant>(PSE.getSCEV(Op2))) {
6749       Op2 = cast<SCEVConstant>(PSE.getSCEV(Op2))->getValue();
6750     }
6751     auto Op2Info = TTI.getOperandInfo(Op2);
6752     if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
6753         shouldConsiderInvariant(Op2))
6754       Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
6755 
6756     SmallVector<const Value *, 4> Operands(I->operand_values());
6757     return TTI.getArithmeticInstrCost(
6758         I->getOpcode(), VectorTy, CostKind,
6759         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6760         Op2Info, Operands, I, TLI);
6761   }
6762   case Instruction::FNeg: {
6763     return TTI.getArithmeticInstrCost(
6764         I->getOpcode(), VectorTy, CostKind,
6765         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6766         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6767         I->getOperand(0), I);
6768   }
6769   case Instruction::Select: {
6770     SelectInst *SI = cast<SelectInst>(I);
6771     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6772     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6773 
6774     const Value *Op0, *Op1;
6775     using namespace llvm::PatternMatch;
6776     if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
6777                         match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
6778       // select x, y, false --> x & y
6779       // select x, true, y --> x | y
6780       const auto [Op1VK, Op1VP] = TTI::getOperandInfo(Op0);
6781       const auto [Op2VK, Op2VP] = TTI::getOperandInfo(Op1);
6782       assert(Op0->getType()->getScalarSizeInBits() == 1 &&
6783               Op1->getType()->getScalarSizeInBits() == 1);
6784 
6785       SmallVector<const Value *, 2> Operands{Op0, Op1};
6786       return TTI.getArithmeticInstrCost(
6787           match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy,
6788           CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, I);
6789     }
6790 
6791     Type *CondTy = SI->getCondition()->getType();
6792     if (!ScalarCond)
6793       CondTy = VectorType::get(CondTy, VF);
6794 
6795     CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
6796     if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
6797       Pred = Cmp->getPredicate();
6798     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred,
6799                                   CostKind, {TTI::OK_AnyValue, TTI::OP_None},
6800                                   {TTI::OK_AnyValue, TTI::OP_None}, I);
6801   }
6802   case Instruction::ICmp:
6803   case Instruction::FCmp: {
6804     Type *ValTy = I->getOperand(0)->getType();
6805 
6806     if (canTruncateToMinimalBitwidth(I, VF)) {
6807       Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6808       (void)Op0AsInstruction;
6809       assert((!canTruncateToMinimalBitwidth(Op0AsInstruction, VF) ||
6810               MinBWs[I] == MinBWs[Op0AsInstruction]) &&
6811              "if both the operand and the compare are marked for "
6812              "truncation, they must have the same bitwidth");
6813       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[I]);
6814     }
6815 
6816     VectorTy = toVectorTy(ValTy, VF);
6817     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
6818                                   cast<CmpInst>(I)->getPredicate(), CostKind,
6819                                   {TTI::OK_AnyValue, TTI::OP_None},
6820                                   {TTI::OK_AnyValue, TTI::OP_None}, I);
6821   }
6822   case Instruction::Store:
6823   case Instruction::Load: {
6824     ElementCount Width = VF;
6825     if (Width.isVector()) {
6826       InstWidening Decision = getWideningDecision(I, Width);
6827       assert(Decision != CM_Unknown &&
6828              "CM decision should be taken at this point");
6829       if (getWideningCost(I, VF) == InstructionCost::getInvalid())
6830         return InstructionCost::getInvalid();
6831       if (Decision == CM_Scalarize)
6832         Width = ElementCount::getFixed(1);
6833     }
6834     VectorTy = toVectorTy(getLoadStoreType(I), Width);
6835     return getMemoryInstructionCost(I, VF);
6836   }
6837   case Instruction::BitCast:
6838     if (I->getType()->isPointerTy())
6839       return 0;
6840     [[fallthrough]];
6841   case Instruction::ZExt:
6842   case Instruction::SExt:
6843   case Instruction::FPToUI:
6844   case Instruction::FPToSI:
6845   case Instruction::FPExt:
6846   case Instruction::PtrToInt:
6847   case Instruction::IntToPtr:
6848   case Instruction::SIToFP:
6849   case Instruction::UIToFP:
6850   case Instruction::Trunc:
6851   case Instruction::FPTrunc: {
6852     // Computes the CastContextHint from a Load/Store instruction.
6853     auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
6854       assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
6855              "Expected a load or a store!");
6856 
6857       if (VF.isScalar() || !TheLoop->contains(I))
6858         return TTI::CastContextHint::Normal;
6859 
6860       switch (getWideningDecision(I, VF)) {
6861       case LoopVectorizationCostModel::CM_GatherScatter:
6862         return TTI::CastContextHint::GatherScatter;
6863       case LoopVectorizationCostModel::CM_Interleave:
6864         return TTI::CastContextHint::Interleave;
6865       case LoopVectorizationCostModel::CM_Scalarize:
6866       case LoopVectorizationCostModel::CM_Widen:
6867         return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
6868                                         : TTI::CastContextHint::Normal;
6869       case LoopVectorizationCostModel::CM_Widen_Reverse:
6870         return TTI::CastContextHint::Reversed;
6871       case LoopVectorizationCostModel::CM_Unknown:
6872         llvm_unreachable("Instr did not go through cost modelling?");
6873       case LoopVectorizationCostModel::CM_VectorCall:
6874       case LoopVectorizationCostModel::CM_IntrinsicCall:
6875         llvm_unreachable_internal("Instr has invalid widening decision");
6876       }
6877 
6878       llvm_unreachable("Unhandled case!");
6879     };
6880 
6881     unsigned Opcode = I->getOpcode();
6882     TTI::CastContextHint CCH = TTI::CastContextHint::None;
6883     // For Trunc, the context is the only user, which must be a StoreInst.
6884     if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
6885       if (I->hasOneUse())
6886         if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
6887           CCH = ComputeCCH(Store);
6888     }
6889     // For Z/Sext, the context is the operand, which must be a LoadInst.
6890     else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
6891              Opcode == Instruction::FPExt) {
6892       if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
6893         CCH = ComputeCCH(Load);
6894     }
6895 
6896     // We optimize the truncation of induction variables having constant
6897     // integer steps. The cost of these truncations is the same as the scalar
6898     // operation.
6899     if (isOptimizableIVTruncate(I, VF)) {
6900       auto *Trunc = cast<TruncInst>(I);
6901       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
6902                                   Trunc->getSrcTy(), CCH, CostKind, Trunc);
6903     }
6904 
6905     // Detect reduction patterns
6906     if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
6907       return *RedCost;
6908 
6909     Type *SrcScalarTy = I->getOperand(0)->getType();
6910     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6911     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6912       SrcScalarTy =
6913           IntegerType::get(SrcScalarTy->getContext(), MinBWs[Op0AsInstruction]);
6914     Type *SrcVecTy =
6915         VectorTy->isVectorTy() ? toVectorTy(SrcScalarTy, VF) : SrcScalarTy;
6916 
6917     if (canTruncateToMinimalBitwidth(I, VF)) {
6918       // If the result type is <= the source type, there will be no extend
6919       // after truncating the users to the minimal required bitwidth.
6920       if (VectorTy->getScalarSizeInBits() <= SrcVecTy->getScalarSizeInBits() &&
6921           (I->getOpcode() == Instruction::ZExt ||
6922            I->getOpcode() == Instruction::SExt))
6923         return 0;
6924     }
6925 
6926     return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
6927   }
6928   case Instruction::Call:
6929     return getVectorCallCost(cast<CallInst>(I), VF);
6930   case Instruction::ExtractValue:
6931     return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
6932   case Instruction::Alloca:
6933     // We cannot easily widen alloca to a scalable alloca, as
6934     // the result would need to be a vector of pointers.
6935     if (VF.isScalable())
6936       return InstructionCost::getInvalid();
6937     [[fallthrough]];
6938   default:
6939     // This opcode is unknown. Assume that it is the same as 'mul'.
6940     return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6941   } // end of switch.
6942 }
6943 
6944 void LoopVectorizationCostModel::collectValuesToIgnore() {
6945   // Ignore ephemeral values.
6946   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
6947 
6948   SmallVector<Value *, 4> DeadInterleavePointerOps;
6949   SmallVector<Value *, 4> DeadOps;
6950 
6951   // If a scalar epilogue is required, users outside the loop won't use
6952   // live-outs from the vector loop but from the scalar epilogue. Ignore them if
6953   // that is the case.
6954   bool RequiresScalarEpilogue = requiresScalarEpilogue(true);
6955   auto IsLiveOutDead = [this, RequiresScalarEpilogue](User *U) {
6956     return RequiresScalarEpilogue &&
6957            !TheLoop->contains(cast<Instruction>(U)->getParent());
6958   };
6959 
6960   LoopBlocksDFS DFS(TheLoop);
6961   DFS.perform(LI);
6962   MapVector<Value *, SmallVector<Value *>> DeadInvariantStoreOps;
6963   for (BasicBlock *BB : reverse(make_range(DFS.beginRPO(), DFS.endRPO())))
6964     for (Instruction &I : reverse(*BB)) {
6965       // Find all stores to invariant variables. Since they are going to sink
6966       // outside the loop we do not need calculate cost for them.
6967       StoreInst *SI;
6968       if ((SI = dyn_cast<StoreInst>(&I)) &&
6969           Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) {
6970         ValuesToIgnore.insert(&I);
6971         DeadInvariantStoreOps[SI->getPointerOperand()].push_back(
6972             SI->getValueOperand());
6973       }
6974 
6975       if (VecValuesToIgnore.contains(&I) || ValuesToIgnore.contains(&I))
6976         continue;
6977 
6978       // Add instructions that would be trivially dead and are only used by
6979       // values that are already ignored to DeadOps, to seed the worklist.
6980       if (wouldInstructionBeTriviallyDead(&I, TLI) &&
6981           all_of(I.users(), [this, IsLiveOutDead](User *U) {
6982             return VecValuesToIgnore.contains(U) ||
6983                    ValuesToIgnore.contains(U) || IsLiveOutDead(U);
6984           }))
6985         DeadOps.push_back(&I);
6986 
6987       // For interleave groups, we only create a pointer for the start of the
6988       // interleave group. Queue up addresses of group members except the insert
6989       // position for further processing.
6990       if (isAccessInterleaved(&I)) {
6991         auto *Group = getInterleavedAccessGroup(&I);
6992         if (Group->getInsertPos() == &I)
6993           continue;
6994         Value *PointerOp = getLoadStorePointerOperand(&I);
6995         DeadInterleavePointerOps.push_back(PointerOp);
6996       }
6997 
6998       // Queue branches for analysis. They are dead if their successors only
6999       // contain dead instructions.
7000       if (auto *Br = dyn_cast<BranchInst>(&I)) {
7001         if (Br->isConditional())
7002           DeadOps.push_back(&I);
7003       }
7004     }
7005 
7006   // Mark ops feeding interleave group members as free, if they are only used
7007   // by other dead computations.
7008   for (unsigned I = 0; I != DeadInterleavePointerOps.size(); ++I) {
7009     auto *Op = dyn_cast<Instruction>(DeadInterleavePointerOps[I]);
7010     if (!Op || !TheLoop->contains(Op) || any_of(Op->users(), [this](User *U) {
7011           Instruction *UI = cast<Instruction>(U);
7012           return !VecValuesToIgnore.contains(U) &&
7013                  (!isAccessInterleaved(UI) ||
7014                   getInterleavedAccessGroup(UI)->getInsertPos() == UI);
7015         }))
7016       continue;
7017     VecValuesToIgnore.insert(Op);
7018     DeadInterleavePointerOps.append(Op->op_begin(), Op->op_end());
7019   }
7020 
7021   for (const auto &[_, Ops] : DeadInvariantStoreOps) {
7022     for (Value *Op : ArrayRef(Ops).drop_back())
7023       DeadOps.push_back(Op);
7024   }
7025   // Mark ops that would be trivially dead and are only used by ignored
7026   // instructions as free.
7027   BasicBlock *Header = TheLoop->getHeader();
7028 
7029   // Returns true if the block contains only dead instructions. Such blocks will
7030   // be removed by VPlan-to-VPlan transforms and won't be considered by the
7031   // VPlan-based cost model, so skip them in the legacy cost-model as well.
7032   auto IsEmptyBlock = [this](BasicBlock *BB) {
7033     return all_of(*BB, [this](Instruction &I) {
7034       return ValuesToIgnore.contains(&I) || VecValuesToIgnore.contains(&I) ||
7035              (isa<BranchInst>(&I) && !cast<BranchInst>(&I)->isConditional());
7036     });
7037   };
7038   for (unsigned I = 0; I != DeadOps.size(); ++I) {
7039     auto *Op = dyn_cast<Instruction>(DeadOps[I]);
7040 
7041     // Check if the branch should be considered dead.
7042     if (auto *Br = dyn_cast_or_null<BranchInst>(Op)) {
7043       BasicBlock *ThenBB = Br->getSuccessor(0);
7044       BasicBlock *ElseBB = Br->getSuccessor(1);
7045       // Don't consider branches leaving the loop for simplification.
7046       if (!TheLoop->contains(ThenBB) || !TheLoop->contains(ElseBB))
7047         continue;
7048       bool ThenEmpty = IsEmptyBlock(ThenBB);
7049       bool ElseEmpty = IsEmptyBlock(ElseBB);
7050       if ((ThenEmpty && ElseEmpty) ||
7051           (ThenEmpty && ThenBB->getSingleSuccessor() == ElseBB &&
7052            ElseBB->phis().empty()) ||
7053           (ElseEmpty && ElseBB->getSingleSuccessor() == ThenBB &&
7054            ThenBB->phis().empty())) {
7055         VecValuesToIgnore.insert(Br);
7056         DeadOps.push_back(Br->getCondition());
7057       }
7058       continue;
7059     }
7060 
7061     // Skip any op that shouldn't be considered dead.
7062     if (!Op || !TheLoop->contains(Op) ||
7063         (isa<PHINode>(Op) && Op->getParent() == Header) ||
7064         !wouldInstructionBeTriviallyDead(Op, TLI) ||
7065         any_of(Op->users(), [this, IsLiveOutDead](User *U) {
7066           return !VecValuesToIgnore.contains(U) &&
7067                  !ValuesToIgnore.contains(U) && !IsLiveOutDead(U);
7068         }))
7069       continue;
7070 
7071     if (!TheLoop->contains(Op->getParent()))
7072       continue;
7073 
7074     // If all of Op's users are in ValuesToIgnore, add it to ValuesToIgnore
7075     // which applies for both scalar and vector versions. Otherwise it is only
7076     // dead in vector versions, so only add it to VecValuesToIgnore.
7077     if (all_of(Op->users(),
7078                [this](User *U) { return ValuesToIgnore.contains(U); }))
7079       ValuesToIgnore.insert(Op);
7080 
7081     VecValuesToIgnore.insert(Op);
7082     DeadOps.append(Op->op_begin(), Op->op_end());
7083   }
7084 
7085   // Ignore type-promoting instructions we identified during reduction
7086   // detection.
7087   for (const auto &Reduction : Legal->getReductionVars()) {
7088     const RecurrenceDescriptor &RedDes = Reduction.second;
7089     const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
7090     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7091   }
7092   // Ignore type-casting instructions we identified during induction
7093   // detection.
7094   for (const auto &Induction : Legal->getInductionVars()) {
7095     const InductionDescriptor &IndDes = Induction.second;
7096     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7097     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7098   }
7099 }
7100 
7101 void LoopVectorizationCostModel::collectInLoopReductions() {
7102   for (const auto &Reduction : Legal->getReductionVars()) {
7103     PHINode *Phi = Reduction.first;
7104     const RecurrenceDescriptor &RdxDesc = Reduction.second;
7105 
7106     // We don't collect reductions that are type promoted (yet).
7107     if (RdxDesc.getRecurrenceType() != Phi->getType())
7108       continue;
7109 
7110     // If the target would prefer this reduction to happen "in-loop", then we
7111     // want to record it as such.
7112     unsigned Opcode = RdxDesc.getOpcode();
7113     if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
7114         !TTI.preferInLoopReduction(Opcode, Phi->getType(),
7115                                    TargetTransformInfo::ReductionFlags()))
7116       continue;
7117 
7118     // Check that we can correctly put the reductions into the loop, by
7119     // finding the chain of operations that leads from the phi to the loop
7120     // exit value.
7121     SmallVector<Instruction *, 4> ReductionOperations =
7122         RdxDesc.getReductionOpChain(Phi, TheLoop);
7123     bool InLoop = !ReductionOperations.empty();
7124 
7125     if (InLoop) {
7126       InLoopReductions.insert(Phi);
7127       // Add the elements to InLoopReductionImmediateChains for cost modelling.
7128       Instruction *LastChain = Phi;
7129       for (auto *I : ReductionOperations) {
7130         InLoopReductionImmediateChains[I] = LastChain;
7131         LastChain = I;
7132       }
7133     }
7134     LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
7135                       << " reduction for phi: " << *Phi << "\n");
7136   }
7137 }
7138 
7139 // This function will select a scalable VF if the target supports scalable
7140 // vectors and a fixed one otherwise.
7141 // TODO: we could return a pair of values that specify the max VF and
7142 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
7143 // `buildVPlans(VF, VF)`. We cannot do that yet because VPlan does not
7144 // have a cost model that can choose which plan to execute if
7145 // more than one is generated.
7146 static ElementCount determineVPlanVF(const TargetTransformInfo &TTI,
7147                                      LoopVectorizationCostModel &CM) {
7148   unsigned WidestType;
7149   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
7150 
7151   TargetTransformInfo::RegisterKind RegKind =
7152       TTI.enableScalableVectorization()
7153           ? TargetTransformInfo::RGK_ScalableVector
7154           : TargetTransformInfo::RGK_FixedWidthVector;
7155 
7156   TypeSize RegSize = TTI.getRegisterBitWidth(RegKind);
7157   unsigned N = RegSize.getKnownMinValue() / WidestType;
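  // For example, a 128-bit register and a widest element type of 32 bits gives
  // N = 4, i.e. VF = 4 (or vscale x 4 when the register size is scalable).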
7158   return ElementCount::get(N, RegSize.isScalable());
7159 }
7160 
7161 VectorizationFactor
7162 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
7163   ElementCount VF = UserVF;
7164   // Outer loop handling: outer loops may require CFG and instruction-level
7165   // transformations before even evaluating whether vectorization is profitable.
7166   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7167   // the vectorization pipeline.
7168   if (!OrigLoop->isInnermost()) {
7169     // If the user doesn't provide a vectorization factor, determine a
7170     // reasonable one.
7171     if (UserVF.isZero()) {
7172       VF = determineVPlanVF(TTI, CM);
7173       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
7174 
7175       // Make sure we have a VF > 1 for stress testing.
7176       if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
7177         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
7178                           << "overriding computed VF.\n");
7179         VF = ElementCount::getFixed(4);
7180       }
7181     } else if (UserVF.isScalable() && !TTI.supportsScalableVectors() &&
7182                !ForceTargetSupportsScalableVectors) {
7183       LLVM_DEBUG(dbgs() << "LV: Not vectorizing. Scalable VF requested, but "
7184                         << "not supported by the target.\n");
7185       reportVectorizationFailure(
7186           "Scalable vectorization requested but not supported by the target",
7187           "the scalable user-specified vectorization width for outer-loop "
7188           "vectorization cannot be used because the target does not support "
7189           "scalable vectors.",
7190           "ScalableVFUnfeasible", ORE, OrigLoop);
7191       return VectorizationFactor::Disabled();
7192     }
7193     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7194     assert(isPowerOf2_32(VF.getKnownMinValue()) &&
7195            "VF needs to be a power of two");
7196     LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
7197                       << "VF " << VF << " to build VPlans.\n");
7198     buildVPlans(VF, VF);
7199 
7200     // For VPlan build stress testing, we bail out after VPlan construction.
7201     if (VPlanBuildStressTest)
7202       return VectorizationFactor::Disabled();
7203 
7204     return {VF, 0 /*Cost*/, 0 /* ScalarCost */};
7205   }
7206 
7207   LLVM_DEBUG(
7208       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
7209                 "VPlan-native path.\n");
7210   return VectorizationFactor::Disabled();
7211 }
7212 
7213 void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
7214   assert(OrigLoop->isInnermost() && "Inner loop expected.");
7215   CM.collectValuesToIgnore();
7216   CM.collectElementTypesForWidening();
7217 
7218   FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
7219   if (!MaxFactors) // Cases that should not be vectorized or interleaved.
7220     return;
7221 
7222   // Invalidate interleave groups if all blocks of loop will be predicated.
7223   if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
7224       !useMaskedInterleavedAccesses(TTI)) {
7225     LLVM_DEBUG(
7226         dbgs()
7227         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
7228            "which requires masked-interleaved support.\n");
7229     if (CM.InterleaveInfo.invalidateGroups())
7230       // Invalidating interleave groups also requires invalidating all decisions
7231       // based on them, which includes widening decisions and uniform and scalar
7232       // values.
7233       CM.invalidateCostModelingDecisions();
7234   }
7235 
7236   if (CM.foldTailByMasking())
7237     Legal->prepareToFoldTailByMasking();
7238 
7239   ElementCount MaxUserVF =
7240       UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
7241   if (UserVF) {
7242     if (!ElementCount::isKnownLE(UserVF, MaxUserVF)) {
7243       reportVectorizationInfo(
7244           "UserVF ignored because it may be larger than the maximal safe VF",
7245           "InvalidUserVF", ORE, OrigLoop);
7246     } else {
7247       assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
7248              "VF needs to be a power of two");
7249       // Collect the instructions (and their associated costs) that will be more
7250       // profitable to scalarize.
7251       CM.collectInLoopReductions();
7252       if (CM.selectUserVectorizationFactor(UserVF)) {
7253         LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
7254         buildVPlansWithVPRecipes(UserVF, UserVF);
7255         LLVM_DEBUG(printPlans(dbgs()));
7256         return;
7257       }
7258       reportVectorizationInfo("UserVF ignored because of invalid costs.",
7259                               "InvalidCost", ORE, OrigLoop);
7260     }
7261   }
7262 
7263   // Collect the Vectorization Factor Candidates.
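  // For illustration (not from the source): with MaxFactors.FixedVF == 16 and
  // MaxFactors.ScalableVF == vscale x 4, the candidates are
  // {1, 2, 4, 8, 16, vscale x 1, vscale x 2, vscale x 4}.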
7264   SmallVector<ElementCount> VFCandidates;
7265   for (auto VF = ElementCount::getFixed(1);
7266        ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
7267     VFCandidates.push_back(VF);
7268   for (auto VF = ElementCount::getScalable(1);
7269        ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
7270     VFCandidates.push_back(VF);
7271 
7272   CM.collectInLoopReductions();
7273   for (const auto &VF : VFCandidates) {
7274     // Collect Uniform and Scalar instructions after vectorization with VF.
7275     CM.collectUniformsAndScalars(VF);
7276 
7277     // Collect the instructions (and their associated costs) that will be more
7278     // profitable to scalarize.
7279     if (VF.isVector())
7280       CM.collectInstsToScalarize(VF);
7281   }
7282 
7283   buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
7284   buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
7285 
7286   LLVM_DEBUG(printPlans(dbgs()));
7287 }
7288 
7289 InstructionCost VPCostContext::getLegacyCost(Instruction *UI,
7290                                              ElementCount VF) const {
7291   if (ForceTargetInstructionCost.getNumOccurrences())
7292     return InstructionCost(ForceTargetInstructionCost.getNumOccurrences());
7293   return CM.getInstructionCost(UI, VF);
7294 }
7295 
7296 bool VPCostContext::skipCostComputation(Instruction *UI, bool IsVector) const {
7297   return CM.ValuesToIgnore.contains(UI) ||
7298          (IsVector && CM.VecValuesToIgnore.contains(UI)) ||
7299          SkipCostComputation.contains(UI);
7300 }
7301 
7302 InstructionCost
7303 LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
7304                                           VPCostContext &CostCtx) const {
7305   InstructionCost Cost;
7306   // Cost modeling for inductions is inaccurate in the legacy cost model
7307   // compared to the recipes that are generated. To match the legacy model during
7308   // VPlan cost-model bring-up, initially use the induction costs from the legacy
7309   // cost model. Note that we do this as pre-processing; the VPlan may not have
7310   // any recipes associated with the original induction increment instruction
7311   // and may replace truncates with VPWidenIntOrFpInductionRecipe. We precompute
7312   // the cost of induction phis and increments (both those that are represented by
7313   // recipes and those that are not), to avoid distinguishing between them here,
7314   // and skip all recipes that represent induction phis and increments (the
7315   // former case) later on, if they exist, to avoid counting them twice.
7316   // Similarly we pre-compute the cost of any optimized truncates.
7317   // TODO: Switch to more accurate costing based on VPlan.
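  // Illustrative example (not from the source): for
  //   %iv = phi i64 [ 0, %ph ], [ %iv.next, %latch ]
  //   %iv.next = add nuw nsw i64 %iv, 1
  // both %iv and %iv.next are costed here with the legacy model and recorded
  // in SkipCostComputation, so any recipe still corresponding to them is not
  // costed a second time below.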
7318   for (const auto &[IV, IndDesc] : Legal->getInductionVars()) {
7319     Instruction *IVInc = cast<Instruction>(
7320         IV->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
7321     SmallVector<Instruction *> IVInsts = {IVInc};
7322     for (unsigned I = 0; I != IVInsts.size(); I++) {
7323       for (Value *Op : IVInsts[I]->operands()) {
7324         auto *OpI = dyn_cast<Instruction>(Op);
7325         if (Op == IV || !OpI || !OrigLoop->contains(OpI) || !Op->hasOneUse())
7326           continue;
7327         IVInsts.push_back(OpI);
7328       }
7329     }
7330     IVInsts.push_back(IV);
7331     for (User *U : IV->users()) {
7332       auto *CI = cast<Instruction>(U);
7333       if (!CostCtx.CM.isOptimizableIVTruncate(CI, VF))
7334         continue;
7335       IVInsts.push_back(CI);
7336     }
7337 
7338     // If the vector loop gets executed exactly once with the given VF, ignore
7339     // the costs of comparison and induction instructions, as they'll get
7340     // simplified away.
7341     // TODO: Remove this code after stepping away from the legacy cost model and
7342     // adding code to simplify VPlans before calculating their costs.
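    // As an illustration (not from the source): with a constant trip count of
    // 8 and VF = 8, the vector loop body runs exactly once, so its induction
    // update and latch compare fold away and are not costed.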
7343     auto TC = PSE.getSE()->getSmallConstantTripCount(OrigLoop);
7344     if (VF.isFixed() && TC == VF.getFixedValue() && !CM.foldTailByMasking())
7345       addFullyUnrolledInstructionsToIgnore(OrigLoop, Legal->getInductionVars(),
7346                                            CostCtx.SkipCostComputation);
7347 
7348     for (Instruction *IVInst : IVInsts) {
7349       if (CostCtx.skipCostComputation(IVInst, VF.isVector()))
7350         continue;
7351       InstructionCost InductionCost = CostCtx.getLegacyCost(IVInst, VF);
7352       LLVM_DEBUG({
7353         dbgs() << "Cost of " << InductionCost << " for VF " << VF
7354                << ": induction instruction " << *IVInst << "\n";
7355       });
7356       Cost += InductionCost;
7357       CostCtx.SkipCostComputation.insert(IVInst);
7358     }
7359   }
7360 
7361   // Compute the cost of all exiting conditions of the loop using the legacy
7362   // cost model. This is to match the legacy behavior, which adds the cost of
7363   // all exit conditions. Note that this over-estimates the cost, as there will
7364   // be a single condition to control the vector loop.
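  // Illustrative example (not from the source): for a loop with a countable
  // latch exit and an additional early exit, both
  //   %exitcond = icmp eq i64 %iv.next, %n
  //   %early = icmp slt i32 %v, 0
  // are costed here (together with instructions feeding only them), even
  // though the vector loop itself ends up controlled by a single condition.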
7365   SmallVector<BasicBlock *> Exiting;
7366   CM.TheLoop->getExitingBlocks(Exiting);
7367   SetVector<Instruction *> ExitInstrs;
7368   // Collect all exit conditions.
7369   for (BasicBlock *EB : Exiting) {
7370     auto *Term = dyn_cast<BranchInst>(EB->getTerminator());
7371     if (!Term)
7372       continue;
7373     if (auto *CondI = dyn_cast<Instruction>(Term->getOperand(0))) {
7374       ExitInstrs.insert(CondI);
7375     }
7376   }
7377   // Compute the cost of all instructions only feeding the exit conditions.
7378   for (unsigned I = 0; I != ExitInstrs.size(); ++I) {
7379     Instruction *CondI = ExitInstrs[I];
7380     if (!OrigLoop->contains(CondI) ||
7381         !CostCtx.SkipCostComputation.insert(CondI).second)
7382       continue;
7383     InstructionCost CondICost = CostCtx.getLegacyCost(CondI, VF);
7384     LLVM_DEBUG({
7385       dbgs() << "Cost of " << CondICost << " for VF " << VF
7386              << ": exit condition instruction " << *CondI << "\n";
7387     });
7388     Cost += CondICost;
7389     for (Value *Op : CondI->operands()) {
7390       auto *OpI = dyn_cast<Instruction>(Op);
7391       if (!OpI || any_of(OpI->users(), [&ExitInstrs, this](User *U) {
7392             return OrigLoop->contains(cast<Instruction>(U)->getParent()) &&
7393                    !ExitInstrs.contains(cast<Instruction>(U));
7394           }))
7395         continue;
7396       ExitInstrs.insert(OpI);
7397     }
7398   }
7399 
7400   // The legacy cost model has special logic to compute the cost of in-loop
7401   // reductions, which may be smaller than the sum of all instructions involved
7402   // in the reduction.
7403   // TODO: Switch to costing based on VPlan once the logic has been ported.
7404   for (const auto &[RedPhi, RdxDesc] : Legal->getReductionVars()) {
7405     if (ForceTargetInstructionCost.getNumOccurrences())
7406       continue;
7407 
7408     if (!CM.isInLoopReduction(RedPhi))
7409       continue;
7410 
7411     const auto &ChainOps = RdxDesc.getReductionOpChain(RedPhi, OrigLoop);
7412     SetVector<Instruction *> ChainOpsAndOperands(ChainOps.begin(),
7413                                                  ChainOps.end());
7414     auto IsZExtOrSExt = [](const unsigned Opcode) -> bool {
7415       return Opcode == Instruction::ZExt || Opcode == Instruction::SExt;
7416     };
7417     // Also include the operands of instructions in the chain, as the cost-model
7418     // may mark extends as free.
7419     //
7420     // For ARM, some of the instructions can be folded into the reduction
7421     // instruction, so we need to mark all folded instructions free.
7422     // For example: We can fold reduce(mul(ext(A), ext(B))) into one
7423     // instruction.
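    // Illustrative example (not from the source): for
    //   %a.ext = sext i16 %a to i32
    //   %b.ext = sext i16 %b to i32
    //   %mul   = mul i32 %a.ext, %b.ext
    //   %red   = add i32 %red.phi, %mul
    // a target with a multiply-accumulate reduction may report a single
    // pattern cost for the chain; instructions with such a pattern cost are
    // pre-costed here and skipped in the per-recipe costing below.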
7424     for (auto *ChainOp : ChainOps) {
7425       for (Value *Op : ChainOp->operands()) {
7426         if (auto *I = dyn_cast<Instruction>(Op)) {
7427           ChainOpsAndOperands.insert(I);
7428           if (I->getOpcode() == Instruction::Mul) {
7429             auto *Ext0 = dyn_cast<Instruction>(I->getOperand(0));
7430             auto *Ext1 = dyn_cast<Instruction>(I->getOperand(1));
7431             if (Ext0 && IsZExtOrSExt(Ext0->getOpcode()) && Ext1 &&
7432                 Ext0->getOpcode() == Ext1->getOpcode()) {
7433               ChainOpsAndOperands.insert(Ext0);
7434               ChainOpsAndOperands.insert(Ext1);
7435             }
7436           }
7437         }
7438       }
7439     }
7440 
7441     // Pre-compute the cost for I, if it has a reduction pattern cost.
7442     for (Instruction *I : ChainOpsAndOperands) {
7443       auto ReductionCost = CM.getReductionPatternCost(
7444           I, VF, toVectorTy(I->getType(), VF), TTI::TCK_RecipThroughput);
7445       if (!ReductionCost)
7446         continue;
7447 
7448       assert(!CostCtx.SkipCostComputation.contains(I) &&
7449              "reduction op visited multiple times");
7450       CostCtx.SkipCostComputation.insert(I);
7451       LLVM_DEBUG(dbgs() << "Cost of " << ReductionCost << " for VF " << VF
7452                         << ":\n in-loop reduction " << *I << "\n");
7453       Cost += *ReductionCost;
7454     }
7455   }
7456 
7457   // Pre-compute the costs for branches except for the backedge, as the number
7458   // of replicate regions in a VPlan may not directly match the number of
7459   // branches, which would lead to different decisions.
7460   // TODO: Compute cost of branches for each replicate region in the VPlan,
7461   // which is more accurate than the legacy cost model.
7462   for (BasicBlock *BB : OrigLoop->blocks()) {
7463     if (CostCtx.skipCostComputation(BB->getTerminator(), VF.isVector()))
7464       continue;
7465     CostCtx.SkipCostComputation.insert(BB->getTerminator());
7466     if (BB == OrigLoop->getLoopLatch())
7467       continue;
7468     auto BranchCost = CostCtx.getLegacyCost(BB->getTerminator(), VF);
7469     Cost += BranchCost;
7470   }
7471 
7472   // Pre-compute costs for instructions that are forced-scalar or profitable to
7473   // scalarize. Their costs will be computed separately in the legacy cost
7474   // model.
7475   for (Instruction *ForcedScalar : CM.ForcedScalars[VF]) {
7476     if (CostCtx.skipCostComputation(ForcedScalar, VF.isVector()))
7477       continue;
7478     CostCtx.SkipCostComputation.insert(ForcedScalar);
7479     InstructionCost ForcedCost = CostCtx.getLegacyCost(ForcedScalar, VF);
7480     LLVM_DEBUG({
7481       dbgs() << "Cost of " << ForcedCost << " for VF " << VF
7482              << ": forced scalar " << *ForcedScalar << "\n";
7483     });
7484     Cost += ForcedCost;
7485   }
7486   for (const auto &[Scalarized, ScalarCost] : CM.InstsToScalarize[VF]) {
7487     if (CostCtx.skipCostComputation(Scalarized, VF.isVector()))
7488       continue;
7489     CostCtx.SkipCostComputation.insert(Scalarized);
7490     LLVM_DEBUG({
7491       dbgs() << "Cost of " << ScalarCost << " for VF " << VF
7492              << ": profitable to scalarize " << *Scalarized << "\n";
7493     });
7494     Cost += ScalarCost;
7495   }
7496 
7497   return Cost;
7498 }
7499 
7500 InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
7501                                                ElementCount VF) const {
7502   VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM);
7503   InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx);
7504 
7505   // Now compute and add the VPlan-based cost.
7506   Cost += Plan.cost(VF, CostCtx);
7507 #ifndef NDEBUG
7508   unsigned EstimatedWidth = getEstimatedRuntimeVF(OrigLoop, CM.TTI, VF);
7509   LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost
7510                     << " (Estimated cost per lane: ");
7511   if (Cost.isValid()) {
7512     double CostPerLane = double(*Cost.getValue()) / EstimatedWidth;
7513     LLVM_DEBUG(dbgs() << format("%.1f", CostPerLane));
7514   } else /* No point dividing an invalid cost - it will still be invalid */
7515     LLVM_DEBUG(dbgs() << "Invalid");
7516   LLVM_DEBUG(dbgs() << ")\n");
7517 #endif
7518   return Cost;
7519 }
7520 
7521 #ifndef NDEBUG
7522 /// Return true if the original loop \p TheLoop contains any instructions that
7523 /// do not have corresponding recipes in \p Plan and are not marked to be
7524 /// ignored in \p CostCtx. This means the VPlan contains simplifications that
7525 /// the legacy cost-model did not account for.
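/// Illustrative example (not from the source): if VPlan simplification removed
/// a redundant truncate or a dead induction update that the legacy model still
/// costs, that instruction has no recipe in \p Plan and is not ignored, so this
/// returns true and the two cost models are allowed to disagree.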
7526 static bool planContainsAdditionalSimplifications(VPlan &Plan,
7527                                                   VPCostContext &CostCtx,
7528                                                   Loop *TheLoop) {
7529   // First collect all instructions for the recipes in Plan.
7530   auto GetInstructionForCost = [](const VPRecipeBase *R) -> Instruction * {
7531     if (auto *S = dyn_cast<VPSingleDefRecipe>(R))
7532       return dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
7533     if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(R))
7534       return &WidenMem->getIngredient();
7535     return nullptr;
7536   };
7537 
7538   DenseSet<Instruction *> SeenInstrs;
7539   auto Iter = vp_depth_first_deep(Plan.getVectorLoopRegion()->getEntry());
7540   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
7541     for (VPRecipeBase &R : *VPBB) {
7542       if (auto *IR = dyn_cast<VPInterleaveRecipe>(&R)) {
7543         auto *IG = IR->getInterleaveGroup();
7544         unsigned NumMembers = IG->getNumMembers();
7545         for (unsigned I = 0; I != NumMembers; ++I) {
7546           if (Instruction *M = IG->getMember(I))
7547             SeenInstrs.insert(M);
7548         }
7549         continue;
7550       }
7551       // The VPlan-based cost model is more accurate for partial reduction and
7552       // comparing against the legacy cost isn't desirable.
7553       if (isa<VPPartialReductionRecipe>(&R))
7554         return true;
7555       if (Instruction *UI = GetInstructionForCost(&R))
7556         SeenInstrs.insert(UI);
7557     }
7558   }
7559 
7560   // Return true if the loop contains any instructions that are not also part of
7561   // the VPlan or are skipped for VPlan-based cost computations. This indicates
7562   // that the VPlan contains extra simplifications.
7563   return any_of(TheLoop->blocks(), [&SeenInstrs, &CostCtx,
7564                                     TheLoop](BasicBlock *BB) {
7565     return any_of(*BB, [&SeenInstrs, &CostCtx, TheLoop, BB](Instruction &I) {
7566       if (isa<PHINode>(&I) && BB == TheLoop->getHeader())
7567         return false;
7568       return !SeenInstrs.contains(&I) && !CostCtx.skipCostComputation(&I, true);
7569     });
7570   });
7571 }
7572 #endif
7573 
7574 VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
7575   if (VPlans.empty())
7576     return VectorizationFactor::Disabled();
7577   // If there is a single VPlan with a single VF, return it directly.
7578   VPlan &FirstPlan = *VPlans[0];
7579   if (VPlans.size() == 1 && size(FirstPlan.vectorFactors()) == 1)
7580     return {*FirstPlan.vectorFactors().begin(), 0, 0};
7581 
7582   ElementCount ScalarVF = ElementCount::getFixed(1);
7583   assert(hasPlanWithVF(ScalarVF) &&
7584          "More than a single plan/VF w/o any plan having scalar VF");
7585 
7586   // TODO: Compute scalar cost using VPlan-based cost model.
7587   InstructionCost ScalarCost = CM.expectedCost(ScalarVF);
7588   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ScalarCost << ".\n");
7589   VectorizationFactor ScalarFactor(ScalarVF, ScalarCost, ScalarCost);
7590   VectorizationFactor BestFactor = ScalarFactor;
7591 
7592   bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
7593   if (ForceVectorization) {
7594     // Ignore scalar width, because the user explicitly wants vectorization.
7595     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
7596     // evaluation.
7597     BestFactor.Cost = InstructionCost::getMax();
7598   }
7599 
7600   for (auto &P : VPlans) {
7601     for (ElementCount VF : P->vectorFactors()) {
7602       if (VF.isScalar())
7603         continue;
7604       if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
7605         LLVM_DEBUG(
7606             dbgs()
7607             << "LV: Not considering vector loop of width " << VF
7608             << " because it will not generate any vector instructions.\n");
7609         continue;
7610       }
7611 
7612       InstructionCost Cost = cost(*P, VF);
7613       VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);
7614       if (isMoreProfitable(CurrentFactor, BestFactor))
7615         BestFactor = CurrentFactor;
7616 
7617       // If profitable add it to ProfitableVF list.
7618       if (isMoreProfitable(CurrentFactor, ScalarFactor))
7619         ProfitableVFs.push_back(CurrentFactor);
7620     }
7621   }
7622 
7623 #ifndef NDEBUG
7624   // Select the optimal vectorization factor according to the legacy cost-model.
7625   // This is now only used to verify the decisions by the new VPlan-based
7626   // cost-model and will be retired once the VPlan-based cost-model is
7627   // stabilized.
7628   VectorizationFactor LegacyVF = selectVectorizationFactor();
7629   VPlan &BestPlan = getPlanFor(BestFactor.Width);
7630 
7631   // Pre-compute the cost and use it to check if BestPlan contains any
7632   // simplifications not accounted for in the legacy cost model. If that's the
7633   // case, don't trigger the assertion, as the extra simplifications may cause a
7634   // different VF to be picked by the VPlan-based cost model.
7635   VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM);
7636   precomputeCosts(BestPlan, BestFactor.Width, CostCtx);
7637   assert((BestFactor.Width == LegacyVF.Width ||
7638           planContainsAdditionalSimplifications(getPlanFor(BestFactor.Width),
7639                                                 CostCtx, OrigLoop) ||
7640           planContainsAdditionalSimplifications(getPlanFor(LegacyVF.Width),
7641                                                 CostCtx, OrigLoop)) &&
7642          " VPlan cost model and legacy cost model disagreed");
7643   assert((BestFactor.Width.isScalar() || BestFactor.ScalarCost > 0) &&
7644          "when vectorizing, the scalar cost must be computed.");
7645 #endif
7646 
7647   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << BestFactor.Width << ".\n");
7648   return BestFactor;
7649 }
7650 
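// Illustrative example (not from the source): after this runs on a loop that
// has no unroll metadata yet, the loop ID looks roughly like
//   !0 = distinct !{!0, <existing operands...>, !1}
//   !1 = !{!"llvm.loop.unroll.runtime.disable"}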
7651 static void addRuntimeUnrollDisableMetaData(Loop *L) {
7652   SmallVector<Metadata *, 4> MDs;
7653   // Reserve first location for self reference to the LoopID metadata node.
7654   MDs.push_back(nullptr);
7655   bool IsUnrollMetadata = false;
7656   MDNode *LoopID = L->getLoopID();
7657   if (LoopID) {
7658     // First find existing loop unrolling disable metadata.
7659     for (unsigned I = 1, IE = LoopID->getNumOperands(); I < IE; ++I) {
7660       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(I));
7661       if (MD) {
7662         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
7663         IsUnrollMetadata =
7664             S && S->getString().starts_with("llvm.loop.unroll.disable");
7665       }
7666       MDs.push_back(LoopID->getOperand(I));
7667     }
7668   }
7669 
7670   if (!IsUnrollMetadata) {
7671     // Add runtime unroll disable metadata.
7672     LLVMContext &Context = L->getHeader()->getContext();
7673     SmallVector<Metadata *, 1> DisableOperands;
7674     DisableOperands.push_back(
7675         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7676     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7677     MDs.push_back(DisableNode);
7678     MDNode *NewLoopID = MDNode::get(Context, MDs);
7679     // Set operand 0 to refer to the loop id itself.
7680     NewLoopID->replaceOperandWith(0, NewLoopID);
7681     L->setLoopID(NewLoopID);
7682   }
7683 }
7684 
7685 // If \p R is a ComputeReductionResult when vectorizing the epilog loop,
7686 // fix the reduction's scalar PHI node by adding the incoming value from the
7687 // main vector loop.
7688 static void fixReductionScalarResumeWhenVectorizingEpilog(
7689     VPRecipeBase *R, VPTransformState &State, BasicBlock *LoopMiddleBlock,
7690     BasicBlock *BypassBlock) {
7691   auto *EpiRedResult = dyn_cast<VPInstruction>(R);
7692   if (!EpiRedResult ||
7693       EpiRedResult->getOpcode() != VPInstruction::ComputeReductionResult)
7694     return;
7695 
7696   auto *EpiRedHeaderPhi =
7697       cast<VPReductionPHIRecipe>(EpiRedResult->getOperand(0));
7698   const RecurrenceDescriptor &RdxDesc =
7699       EpiRedHeaderPhi->getRecurrenceDescriptor();
7700   Value *MainResumeValue =
7701       EpiRedHeaderPhi->getStartValue()->getUnderlyingValue();
7702   if (RecurrenceDescriptor::isAnyOfRecurrenceKind(
7703           RdxDesc.getRecurrenceKind())) {
7704     auto *Cmp = cast<ICmpInst>(MainResumeValue);
7705     assert(Cmp->getPredicate() == CmpInst::ICMP_NE &&
7706            "AnyOf expected to start with ICMP_NE");
7707     assert(Cmp->getOperand(1) == RdxDesc.getRecurrenceStartValue() &&
7708            "AnyOf expected to start by comparing main resume value to original "
7709            "start value");
7710     MainResumeValue = Cmp->getOperand(0);
7711   } else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(
7712                  RdxDesc.getRecurrenceKind())) {
7713     using namespace llvm::PatternMatch;
7714     Value *Cmp, *OrigResumeV;
7715     bool IsExpectedPattern =
7716         match(MainResumeValue, m_Select(m_OneUse(m_Value(Cmp)),
7717                                         m_Specific(RdxDesc.getSentinelValue()),
7718                                         m_Value(OrigResumeV))) &&
7719         match(Cmp,
7720               m_SpecificICmp(ICmpInst::ICMP_EQ, m_Specific(OrigResumeV),
7721                              m_Specific(RdxDesc.getRecurrenceStartValue())));
7722     assert(IsExpectedPattern && "Unexpected reduction resume pattern");
7723     (void)IsExpectedPattern;
7724     MainResumeValue = OrigResumeV;
7725   }
7726   PHINode *MainResumePhi = cast<PHINode>(MainResumeValue);
7727 
7728   // When fixing reductions in the epilogue loop we should already have
7729   // created a bc.merge.rdx Phi after the main vector body. Ensure that we carry
7730   // over the incoming values correctly.
7731   using namespace VPlanPatternMatch;
7732   auto IsResumePhi = [](VPUser *U) {
7733     return match(
7734         U, m_VPInstruction<VPInstruction::ResumePhi>(m_VPValue(), m_VPValue()));
7735   };
7736   assert(count_if(EpiRedResult->users(), IsResumePhi) == 1 &&
7737          "ResumePhi must have a single user");
7738   auto *EpiResumePhiVPI =
7739       cast<VPInstruction>(*find_if(EpiRedResult->users(), IsResumePhi));
7740   auto *EpiResumePhi = cast<PHINode>(State.get(EpiResumePhiVPI, true));
7741   EpiResumePhi->setIncomingValueForBlock(
7742       BypassBlock, MainResumePhi->getIncomingValueForBlock(BypassBlock));
7743 }
7744 
7745 DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
7746     ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan,
7747     InnerLoopVectorizer &ILV, DominatorTree *DT, bool VectorizingEpilogue,
7748     const DenseMap<const SCEV *, Value *> *ExpandedSCEVs) {
7749   assert(BestVPlan.hasVF(BestVF) &&
7750          "Trying to execute plan with unsupported VF");
7751   assert(BestVPlan.hasUF(BestUF) &&
7752          "Trying to execute plan with unsupported UF");
7753   assert(
7754       ((VectorizingEpilogue && ExpandedSCEVs) ||
7755        (!VectorizingEpilogue && !ExpandedSCEVs)) &&
7756       "expanded SCEVs to reuse can only be used during epilogue vectorization");
7757 
7758   // TODO: Move to VPlan transform stage once the transition to the VPlan-based
7759   // cost model is complete for better cost estimates.
7760   VPlanTransforms::unrollByUF(BestVPlan, BestUF,
7761                               OrigLoop->getHeader()->getContext());
7762   VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
7763   VPlanTransforms::convertToConcreteRecipes(BestVPlan);
7764 
7765   // Perform the actual loop transformation.
7766   VPTransformState State(&TTI, BestVF, BestUF, LI, DT, ILV.Builder, &ILV,
7767                          &BestVPlan, OrigLoop->getParentLoop(),
7768                          Legal->getWidestInductionType());
7769 
7770 #ifdef EXPENSIVE_CHECKS
7771   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
7772 #endif
7773 
7774   // 0. Generate SCEV-dependent code in the entry, including TripCount, before
7775   // making any changes to the CFG.
7776   if (!BestVPlan.getEntry()->empty())
7777     BestVPlan.getEntry()->execute(&State);
7778 
7779   if (!ILV.getTripCount())
7780     ILV.setTripCount(State.get(BestVPlan.getTripCount(), VPLane(0)));
7781   else
7782     assert(VectorizingEpilogue && "should only re-use the existing trip "
7783                                   "count during epilogue vectorization");
7784 
7785   // 1. Set up the skeleton for vectorization, including vector pre-header and
7786   // middle block. The vector loop is created during VPlan execution.
7787   VPBasicBlock *VectorPH =
7788       cast<VPBasicBlock>(BestVPlan.getEntry()->getSingleSuccessor());
7789   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(
7790       ExpandedSCEVs ? *ExpandedSCEVs : State.ExpandedSCEVs);
7791   if (VectorizingEpilogue)
7792     VPlanTransforms::removeDeadRecipes(BestVPlan);
7793 
7794   // Only use noalias metadata when using memory checks guaranteeing no overlap
7795   // across all iterations.
7796   const LoopAccessInfo *LAI = ILV.Legal->getLAI();
7797   std::unique_ptr<LoopVersioning> LVer = nullptr;
7798   if (LAI && !LAI->getRuntimePointerChecking()->getChecks().empty() &&
7799       !LAI->getRuntimePointerChecking()->getDiffChecks()) {
7800 
7801     //  We currently don't use LoopVersioning for the actual loop cloning but we
7802     //  still use it to add the noalias metadata.
7803     //  TODO: Find a better way to re-use LoopVersioning functionality to add
7804     //        metadata.
7805     LVer = std::make_unique<LoopVersioning>(
7806         *LAI, LAI->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, DT,
7807         PSE.getSE());
7808     State.LVer = &*LVer;
7809     State.LVer->prepareNoAliasMetadata();
7810   }
7811 
7812   ILV.printDebugTracesAtStart();
7813 
7814   //===------------------------------------------------===//
7815   //
7816   // Notice: any optimization or new instruction that goes
7817   // into the code below should also be implemented in
7818   // the cost-model.
7819   //
7820   //===------------------------------------------------===//
7821 
7822   // 2. Copy and widen instructions from the old loop into the new loop.
7823   BestVPlan.prepareToExecute(
7824       ILV.getTripCount(),
7825       ILV.getOrCreateVectorTripCount(ILV.LoopVectorPreHeader), State);
7826   replaceVPBBWithIRVPBB(VectorPH, State.CFG.PrevBB);
7827 
7828   BestVPlan.execute(&State);
7829 
7830   auto *MiddleVPBB = BestVPlan.getMiddleBlock();
7831   // 2.5 When vectorizing the epilogue, fix reduction and induction resume
7832   // values from the additional bypass block.
7833   if (VectorizingEpilogue) {
7834     assert(!ILV.Legal->hasUncountableEarlyExit() &&
7835            "Epilogue vectorisation not yet supported with early exits");
7836     BasicBlock *BypassBlock = ILV.getAdditionalBypassBlock();
7837     for (VPRecipeBase &R : *MiddleVPBB) {
7838       fixReductionScalarResumeWhenVectorizingEpilog(
7839           &R, State, State.CFG.VPBB2IRBB[MiddleVPBB], BypassBlock);
7840     }
7841     BasicBlock *PH = OrigLoop->getLoopPreheader();
7842     for (const auto &[IVPhi, _] : Legal->getInductionVars()) {
7843       auto *Inc = cast<PHINode>(IVPhi->getIncomingValueForBlock(PH));
7844       Value *V = ILV.getInductionAdditionalBypassValue(IVPhi);
7845       Inc->setIncomingValueForBlock(BypassBlock, V);
7846     }
7847   }
7848 
7849   // 2.6. Maintain Loop Hints
7850   // Keep all loop hints from the original loop on the vector loop (we'll
7851   // replace the vectorizer-specific hints below).
7852   if (auto *LoopRegion = BestVPlan.getVectorLoopRegion()) {
7853     MDNode *OrigLoopID = OrigLoop->getLoopID();
7854 
7855     std::optional<MDNode *> VectorizedLoopID =
7856         makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7857                                         LLVMLoopVectorizeFollowupVectorized});
7858 
7859     VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
7860     Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]);
7861     if (VectorizedLoopID) {
7862       L->setLoopID(*VectorizedLoopID);
7863     } else {
7864       // Keep all loop hints from the original loop on the vector loop (we'll
7865       // replace the vectorizer-specific hints below).
7866       if (MDNode *LID = OrigLoop->getLoopID())
7867         L->setLoopID(LID);
7868 
7869       LoopVectorizeHints Hints(L, true, *ORE);
7870       Hints.setAlreadyVectorized();
7871     }
7872     TargetTransformInfo::UnrollingPreferences UP;
7873     TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE);
7874     if (!UP.UnrollVectorizedLoop || VectorizingEpilogue)
7875       addRuntimeUnrollDisableMetaData(L);
7876   }
7877 
7878   // 3. Fix the vectorized code: take care of header phi's, live-outs,
7879   //    predication, updating analyses.
7880   ILV.fixVectorizedLoop(State);
7881 
7882   ILV.printDebugTracesAtEnd();
7883 
7884   // 4. Adjust branch weight of the branch in the middle block.
7885   if (BestVPlan.getVectorLoopRegion()) {
7886     auto *MiddleVPBB = BestVPlan.getMiddleBlock();
7887     auto *MiddleTerm =
7888         cast<BranchInst>(State.CFG.VPBB2IRBB[MiddleVPBB]->getTerminator());
7889     if (MiddleTerm->isConditional() &&
7890         hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
7891       // Assume that `Count % VectorTripCount` is equally distributed.
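      // Illustrative example (not from the source): with VF = 4 and UF = 2 the
      // vector step is 8, so the weights below are {1, 7}, modelling that the
      // remainder is zero (and the scalar remainder loop can be skipped)
      // roughly 1 time in 8.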
7892       unsigned TripCount = BestVPlan.getUF() * State.VF.getKnownMinValue();
7893       assert(TripCount > 0 && "trip count should not be zero");
7894       const uint32_t Weights[] = {1, TripCount - 1};
7895       setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false);
7896     }
7897   }
7898 
7899   return State.ExpandedSCEVs;
7900 }
7901 
7902 //===--------------------------------------------------------------------===//
7903 // EpilogueVectorizerMainLoop
7904 //===--------------------------------------------------------------------===//
7905 
7906 /// This function is partially responsible for generating the control flow
7907 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
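/// Roughly, this first pass creates, in order:
///  * iter.check, which bypasses all vector code when the trip count is too
///    small even for the epilogue VF * UF,
///  * optional SCEV and memory runtime check blocks,
///  * vector.main.loop.iter.check, which skips the main vector loop when the
///    trip count is too small for the main VF * UF,
///  * vector.ph, the preheader of the main vector loop.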
7908 BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton(
7909     const SCEV2ValueTy &ExpandedSCEVs) {
7910   createVectorLoopSkeleton("");
7911 
7912   // Generate the code to check the minimum iteration count of the vector
7913   // epilogue (see below).
7914   EPI.EpilogueIterationCountCheck =
7915       emitIterationCountCheck(LoopScalarPreHeader, true);
7916   EPI.EpilogueIterationCountCheck->setName("iter.check");
7917 
7918   // Generate the code to check any assumptions that we've made for SCEV
7919   // expressions.
7920   EPI.SCEVSafetyCheck = emitSCEVChecks(LoopScalarPreHeader);
7921 
7922   // Generate the code that checks at runtime if arrays overlap. We put the
7923   // checks into a separate block to make the more common case of few elements
7924   // faster.
7925   EPI.MemSafetyCheck = emitMemRuntimeChecks(LoopScalarPreHeader);
7926 
7927   // Generate the iteration count check for the main loop, *after* the check
7928   // for the epilogue loop, so that the path-length is shorter for the case
7929   // that goes directly through the vector epilogue. The longer-path length for
7930   // the main loop is compensated for, by the gain from vectorizing the larger
7931   // trip count. Note: the branch will get updated later on when we vectorize
7932   // the epilogue.
7933   EPI.MainLoopIterationCountCheck =
7934       emitIterationCountCheck(LoopScalarPreHeader, false);
7935 
7936   // Generate the induction variable.
7937   EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
7938 
7939   return LoopVectorPreHeader;
7940 }
7941 
7942 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
7943   LLVM_DEBUG({
7944     dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7945            << "Main Loop VF:" << EPI.MainLoopVF
7946            << ", Main Loop UF:" << EPI.MainLoopUF
7947            << ", Epilogue Loop VF:" << EPI.EpilogueVF
7948            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7949   });
7950 }
7951 
7952 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
7953   DEBUG_WITH_TYPE(VerboseDebug, {
7954     dbgs() << "intermediate fn:\n"
7955            << *OrigLoop->getHeader()->getParent() << "\n";
7956   });
7957 }
7958 
7959 BasicBlock *
7960 EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
7961                                                     bool ForEpilogue) {
7962   assert(Bypass && "Expected valid bypass basic block.");
7963   ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF;
7964   unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
7965   Value *Count = getTripCount();
7966   // Reuse existing vector loop preheader for TC checks.
7967   // Note that new preheader block is generated for vector loop.
7968   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
7969   IRBuilder<> Builder(TCCheckBlock->getTerminator());
7970 
7971   // Generate code to check if the loop's trip count is less than VF * UF of the
7972   // main vector loop.
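  // When a scalar epilogue is required, at least one iteration must remain for
  // it, so the bypass is taken when Count <= VF * UF (ULE); otherwise
  // Count < VF * UF (ULT) suffices.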
7973   auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF.isVector()
7974                                                     : VF.isVector())
7975                ? ICmpInst::ICMP_ULE
7976                : ICmpInst::ICMP_ULT;
7977 
7978   Value *CheckMinIters = Builder.CreateICmp(
7979       P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor),
7980       "min.iters.check");
7981 
7982   if (!ForEpilogue)
7983     TCCheckBlock->setName("vector.main.loop.iter.check");
7984 
7985   // Create new preheader for vector loop.
7986   LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
7987                                    DT, LI, nullptr, "vector.ph");
7988 
7989   if (ForEpilogue) {
7990     assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
7991                                  DT->getNode(Bypass)->getIDom()) &&
7992            "TC check is expected to dominate Bypass");
7993 
7994     LoopBypassBlocks.push_back(TCCheckBlock);
7995 
7996     // Save the trip count so we don't have to regenerate it in the
7997     // vec.epilog.iter.check. This is safe to do because the trip count
7998     // generated here dominates the vector epilog iter check.
7999     EPI.TripCount = Count;
8000   }
8001 
8002   BranchInst &BI =
8003       *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
8004   if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
8005     setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
8006   ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
8007 
8008   introduceCheckBlockInVPlan(TCCheckBlock);
8009   return TCCheckBlock;
8010 }
8011 
8012 //===--------------------------------------------------------------------===//
8013 // EpilogueVectorizerEpilogueLoop
8014 //===--------------------------------------------------------------------===//
8015 
8016 /// This function is partially responsible for generating the control flow
8017 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
8018 BasicBlock *
8019 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
8020     const SCEV2ValueTy &ExpandedSCEVs) {
8021   createVectorLoopSkeleton("vec.epilog.");
8022 
8023   // Now, compare the remaining count and if there aren't enough iterations to
8024   // execute the vectorized epilogue, skip to the scalar part.
8025   LoopVectorPreHeader->setName("vec.epilog.ph");
8026   BasicBlock *VecEpilogueIterationCountCheck =
8027       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->begin(), DT, LI,
8028                  nullptr, "vec.epilog.iter.check", true);
8029   emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader,
8030                                           VecEpilogueIterationCountCheck);
8031   AdditionalBypassBlock = VecEpilogueIterationCountCheck;
8032 
8033   // Adjust the control flow taking the state info from the main loop
8034   // vectorization into account.
8035   assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
8036          "expected this to be saved from the previous pass.");
8037   EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
8038       VecEpilogueIterationCountCheck, LoopVectorPreHeader);
8039 
8040   EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
8041       VecEpilogueIterationCountCheck, LoopScalarPreHeader);
8042 
8043   if (EPI.SCEVSafetyCheck)
8044     EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
8045         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
8046   if (EPI.MemSafetyCheck)
8047     EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
8048         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
8049 
8050   DT->changeImmediateDominator(LoopScalarPreHeader,
8051                                EPI.EpilogueIterationCountCheck);
8052   // Keep track of bypass blocks, as they feed start values to the induction and
8053   // reduction phis in the scalar loop preheader.
8054   if (EPI.SCEVSafetyCheck)
8055     LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
8056   if (EPI.MemSafetyCheck)
8057     LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
8058   LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
8059 
8060   // The vec.epilog.iter.check block may contain Phi nodes from inductions or
8061   // reductions which merge control-flow from the latch block and the middle
8062   // block. Update the incoming values here and move the Phi into the preheader.
8063   SmallVector<PHINode *, 4> PhisInBlock;
8064   for (PHINode &Phi : VecEpilogueIterationCountCheck->phis())
8065     PhisInBlock.push_back(&Phi);
8066 
8067   for (PHINode *Phi : PhisInBlock) {
8068     Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI());
8069     Phi->replaceIncomingBlockWith(
8070         VecEpilogueIterationCountCheck->getSinglePredecessor(),
8071         VecEpilogueIterationCountCheck);
8072 
8073     // If the phi doesn't have an incoming value from the
8074     // EpilogueIterationCountCheck, we are done. Otherwise remove the incoming
8075     // value and also those from other check blocks. This is needed for
8076     // reduction phis only.
8077     if (none_of(Phi->blocks(), [&](BasicBlock *IncB) {
8078           return EPI.EpilogueIterationCountCheck == IncB;
8079         }))
8080       continue;
8081     Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
8082     if (EPI.SCEVSafetyCheck)
8083       Phi->removeIncomingValue(EPI.SCEVSafetyCheck);
8084     if (EPI.MemSafetyCheck)
8085       Phi->removeIncomingValue(EPI.MemSafetyCheck);
8086   }
8087 
8088   // Generate bypass values from the additional bypass block. Note that when the
8089   // vectorized epilogue is skipped due to the iteration count check, the
8090   // resume value for the induction variable comes from the trip count of the
8091   // main vector loop, passed as the second argument.
8092   createInductionAdditionalBypassValues(ExpandedSCEVs, EPI.VectorTripCount);
8093   return LoopVectorPreHeader;
8094 }
8095 
8096 BasicBlock *
8097 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
8098     BasicBlock *Bypass, BasicBlock *Insert) {
8099 
8100   assert(EPI.TripCount &&
8101          "Expected trip count to have been saved in the first pass.");
8102   assert(
8103       (!isa<Instruction>(EPI.TripCount) ||
8104        DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
8105       "saved trip count does not dominate insertion point.");
8106   Value *TC = EPI.TripCount;
8107   IRBuilder<> Builder(Insert->getTerminator());
8108   Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
8109 
8110   // Generate code to check if the loop's trip count is less than VF * UF of the
8111   // vector epilogue loop.
8112   auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector())
8113                ? ICmpInst::ICMP_ULE
8114                : ICmpInst::ICMP_ULT;
8115 
8116   Value *CheckMinIters =
8117       Builder.CreateICmp(P, Count,
8118                          createStepForVF(Builder, Count->getType(),
8119                                          EPI.EpilogueVF, EPI.EpilogueUF),
8120                          "min.epilog.iters.check");
8121 
8122   BranchInst &BI =
8123       *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
8124   if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
8125     unsigned MainLoopStep = UF * VF.getKnownMinValue();
8126     unsigned EpilogueLoopStep =
8127         EPI.EpilogueUF * EPI.EpilogueVF.getKnownMinValue();
8128     // We assume the remaining `Count` is equally distributed in
8129     // [0, MainLoopStep)
8130     // So the probability for `Count < EpilogueLoopStep` should be
8131     // min(MainLoopStep, EpilogueLoopStep) / MainLoopStep
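    // Illustrative example (not from the source): with MainLoopStep = 8 and
    // EpilogueLoopStep = 4, the remaining count is assumed uniform in [0, 8),
    // so the probability of skipping the epilogue is 4/8 and the weights below
    // are {4, 4}.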
8132     unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep);
8133     const uint32_t Weights[] = {EstimatedSkipCount,
8134                                 MainLoopStep - EstimatedSkipCount};
8135     setBranchWeights(BI, Weights, /*IsExpected=*/false);
8136   }
8137   ReplaceInstWithInst(Insert->getTerminator(), &BI);
8138   LoopBypassBlocks.push_back(Insert);
8139 
8140   // A new entry block has been created for the epilogue VPlan. Hook it in, as
8141   // otherwise we would try to modify the entry to the main vector loop.
8142   VPIRBasicBlock *NewEntry = Plan.createVPIRBasicBlock(Insert);
8143   VPBasicBlock *OldEntry = Plan.getEntry();
8144   VPBlockUtils::reassociateBlocks(OldEntry, NewEntry);
8145   Plan.setEntry(NewEntry);
8146   // OldEntry is now dead and will be cleaned up when the plan gets destroyed.
8147 
8148   introduceCheckBlockInVPlan(Insert);
8149   return Insert;
8150 }
8151 
8152 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
8153   LLVM_DEBUG({
8154     dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
8155            << "Epilogue Loop VF:" << EPI.EpilogueVF
8156            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
8157   });
8158 }
8159 
8160 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
8161   DEBUG_WITH_TYPE(VerboseDebug, {
8162     dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
8163   });
8164 }
8165 
8166 iterator_range<mapped_iterator<Use *, std::function<VPValue *(Value *)>>>
8167 VPRecipeBuilder::mapToVPValues(User::op_range Operands) {
8168   std::function<VPValue *(Value *)> Fn = [this](Value *Op) {
8169     return getVPValueOrAddLiveIn(Op);
8170   };
8171   return map_range(Operands, Fn);
8172 }
8173 
8174 void VPRecipeBuilder::createSwitchEdgeMasks(SwitchInst *SI) {
8175   BasicBlock *Src = SI->getParent();
8176   assert(!OrigLoop->isLoopExiting(Src) &&
8177          all_of(successors(Src),
8178                 [this](BasicBlock *Succ) {
8179                   return OrigLoop->getHeader() != Succ;
8180                 }) &&
8181          "unsupported switch either exiting loop or continuing to header");
8182   // Create masks where the terminator in Src is a switch. We create masks for
8183   // all edges at the same time. This is more efficient, as we can create and
8184   // collect compares for all cases once.
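  // Illustrative example (not from the source), assuming Src itself is
  // unmasked: for
  //   switch i32 %x, label %bb.default [ i32 0, label %bb.a
  //                                      i32 1, label %bb.a
  //                                      i32 2, label %bb.b ]
  // the edge masks become
  //   Src -> bb.a:       (%x == 0) | (%x == 1)
  //   Src -> bb.b:       (%x == 2)
  //   Src -> bb.default: not(((%x == 0) | (%x == 1)) | (%x == 2))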
8185   VPValue *Cond = getVPValueOrAddLiveIn(SI->getCondition());
8186   BasicBlock *DefaultDst = SI->getDefaultDest();
8187   MapVector<BasicBlock *, SmallVector<VPValue *>> Dst2Compares;
8188   for (auto &C : SI->cases()) {
8189     BasicBlock *Dst = C.getCaseSuccessor();
8190     assert(!EdgeMaskCache.contains({Src, Dst}) && "Edge masks already created");
8191     // Cases whose destination is the same as default are redundant and can be
8192     // ignored - they will get there anyhow.
8193     if (Dst == DefaultDst)
8194       continue;
8195     auto &Compares = Dst2Compares[Dst];
8196     VPValue *V = getVPValueOrAddLiveIn(C.getCaseValue());
8197     Compares.push_back(Builder.createICmp(CmpInst::ICMP_EQ, Cond, V));
8198   }
8199 
8200   // We need to handle 2 separate cases below for all entries in Dst2Compares,
8201   // which excludes destinations matching the default destination.
8202   VPValue *SrcMask = getBlockInMask(Src);
8203   VPValue *DefaultMask = nullptr;
8204   for (const auto &[Dst, Conds] : Dst2Compares) {
8205     // 1. Dst is not the default destination. Dst is reached if any of the cases
8206     // with destination == Dst are taken. Join the conditions for each case
8207     // whose destination == Dst using an OR.
8208     VPValue *Mask = Conds[0];
8209     for (VPValue *V : ArrayRef<VPValue *>(Conds).drop_front())
8210       Mask = Builder.createOr(Mask, V);
8211     if (SrcMask)
8212       Mask = Builder.createLogicalAnd(SrcMask, Mask);
8213     EdgeMaskCache[{Src, Dst}] = Mask;
8214 
8215     // 2. Create the mask for the default destination, which is reached if none
8216     // of the cases with destination != default destination are taken. Join the
8217     // conditions for each case where the destination is != Dst using an OR and
8218     // negate it.
8219     DefaultMask = DefaultMask ? Builder.createOr(DefaultMask, Mask) : Mask;
8220   }
8221 
8222   if (DefaultMask) {
8223     DefaultMask = Builder.createNot(DefaultMask);
8224     if (SrcMask)
8225       DefaultMask = Builder.createLogicalAnd(SrcMask, DefaultMask);
8226   }
8227   EdgeMaskCache[{Src, DefaultDst}] = DefaultMask;
8228 }
8229 
8230 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
8231   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
8232 
8233   // Look for cached value.
8234   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
8235   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
8236   if (ECEntryIt != EdgeMaskCache.end())
8237     return ECEntryIt->second;
8238 
8239   if (auto *SI = dyn_cast<SwitchInst>(Src->getTerminator())) {
8240     createSwitchEdgeMasks(SI);
8241     assert(EdgeMaskCache.contains(Edge) && "Mask for Edge not created?");
8242     return EdgeMaskCache[Edge];
8243   }
8244 
8245   VPValue *SrcMask = getBlockInMask(Src);
8246 
8247   // The terminator has to be a branch inst!
8248   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
8249   assert(BI && "Unexpected terminator found");
8250   if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
8251     return EdgeMaskCache[Edge] = SrcMask;
8252 
8253   // If source is an exiting block, we know the exit edge is dynamically dead
8254   // in the vector loop, and thus we don't need to restrict the mask.  Avoid
8255   // adding uses of an otherwise potentially dead instruction unless we are
8256   // vectorizing a loop with uncountable exits. In that case, we always
8257   // materialize the mask.
8258   if (OrigLoop->isLoopExiting(Src) &&
8259       Src != Legal->getUncountableEarlyExitingBlock())
8260     return EdgeMaskCache[Edge] = SrcMask;
8261 
8262   VPValue *EdgeMask = getVPValueOrAddLiveIn(BI->getCondition());
8263   assert(EdgeMask && "No Edge Mask found for condition");
8264 
8265   if (BI->getSuccessor(0) != Dst)
8266     EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc());
8267 
8268   if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
8269     // The bitwise 'And' of SrcMask and EdgeMask introduces new UB if SrcMask
8270     // is false and EdgeMask is poison. Avoid that by using 'LogicalAnd'
8271     // instead which generates 'select i1 SrcMask, i1 EdgeMask, i1 false'.
8272     EdgeMask = Builder.createLogicalAnd(SrcMask, EdgeMask, BI->getDebugLoc());
8273   }
8274 
8275   return EdgeMaskCache[Edge] = EdgeMask;
8276 }
8277 
8278 VPValue *VPRecipeBuilder::getEdgeMask(BasicBlock *Src, BasicBlock *Dst) const {
8279   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
8280 
8281   // Look for cached value.
8282   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
8283   EdgeMaskCacheTy::const_iterator ECEntryIt = EdgeMaskCache.find(Edge);
8284   assert(ECEntryIt != EdgeMaskCache.end() &&
8285          "looking up mask for edge which has not been created");
8286   return ECEntryIt->second;
8287 }
8288 
8289 void VPRecipeBuilder::createHeaderMask() {
8290   BasicBlock *Header = OrigLoop->getHeader();
8291 
8292   // When not folding the tail, use nullptr to model all-true mask.
8293   if (!CM.foldTailByMasking()) {
8294     BlockMaskCache[Header] = nullptr;
8295     return;
8296   }
8297 
8298   // Introduce the early-exit compare IV <= BTC to form header block mask.
8299   // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
8300   // constructing the desired canonical IV in the header block as its first
8301   // non-phi instructions.
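  // As an illustration (not from the source): for an i8 loop that runs all 256
  // iterations, BTC = 255 is representable but TC = 256 wraps to 0, so
  // "IV < TC" would always be false while "IV <= BTC" yields the intended
  // all-true mask.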
8302 
8303   VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
8304   auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
8305   auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV());
8306   HeaderVPBB->insert(IV, NewInsertionPoint);
8307 
8308   VPBuilder::InsertPointGuard Guard(Builder);
8309   Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
8310   VPValue *BlockMask = nullptr;
8311   VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
8312   BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC);
8313   BlockMaskCache[Header] = BlockMask;
8314 }
8315 
8316 VPValue *VPRecipeBuilder::getBlockInMask(BasicBlock *BB) const {
8317   // Return the cached value.
8318   BlockMaskCacheTy::const_iterator BCEntryIt = BlockMaskCache.find(BB);
8319   assert(BCEntryIt != BlockMaskCache.end() &&
8320          "Trying to access mask for block without one.");
8321   return BCEntryIt->second;
8322 }
8323 
8324 void VPRecipeBuilder::createBlockInMask(BasicBlock *BB) {
8325   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
8326   assert(BlockMaskCache.count(BB) == 0 && "Mask for block already computed");
8327   assert(OrigLoop->getHeader() != BB &&
8328          "Loop header must have cached block mask");
8329 
8330   // All-one mask is modelled as no-mask following the convention for masked
8331   // load/store/gather/scatter. Initialize BlockMask to no-mask.
8332   VPValue *BlockMask = nullptr;
8333   // This is the block mask. We OR all unique incoming edges.
8334   for (auto *Predecessor :
8335        SetVector<BasicBlock *>(pred_begin(BB), pred_end(BB))) {
8336     VPValue *EdgeMask = createEdgeMask(Predecessor, BB);
8337     if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is too.
8338       BlockMaskCache[BB] = EdgeMask;
8339       return;
8340     }
8341 
8342     if (!BlockMask) { // BlockMask has its initialized nullptr value.
8343     if (!BlockMask) { // BlockMask still has its initial nullptr value.
8344       continue;
8345     }
8346 
8347     BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
8348   }
8349 
8350   BlockMaskCache[BB] = BlockMask;
8351 }
8352 
8353 VPWidenMemoryRecipe *
8354 VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
8355                                   VFRange &Range) {
8356   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
8357          "Must be called with either a load or store");
8358 
8359   auto WillWiden = [&](ElementCount VF) -> bool {
8360     LoopVectorizationCostModel::InstWidening Decision =
8361         CM.getWideningDecision(I, VF);
8362     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
8363            "CM decision should be taken at this point.");
8364     if (Decision == LoopVectorizationCostModel::CM_Interleave)
8365       return true;
8366     if (CM.isScalarAfterVectorization(I, VF) ||
8367         CM.isProfitableToScalarize(I, VF))
8368       return false;
8369     return Decision != LoopVectorizationCostModel::CM_Scalarize;
8370   };
8371 
8372   if (!LoopVectorizationPlanner::getDecisionAndClampRange(WillWiden, Range))
8373     return nullptr;
8374 
8375   VPValue *Mask = nullptr;
8376   if (Legal->isMaskRequired(I))
8377     Mask = getBlockInMask(I->getParent());
8378 
8379   // Determine if the pointer operand of the access is either consecutive or
8380   // reverse consecutive.
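  // Illustrative example (not from the source): a load of A[i] in a loop with
  // a unit step gets CM_Widen (consecutive), while a load of A[N - i],
  // striding backwards, gets CM_Widen_Reverse (reverse consecutive).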
8381   LoopVectorizationCostModel::InstWidening Decision =
8382       CM.getWideningDecision(I, Range.Start);
8383   bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
8384   bool Consecutive =
8385       Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
8386 
8387   VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
8388   if (Consecutive) {
8389     auto *GEP = dyn_cast<GetElementPtrInst>(
8390         Ptr->getUnderlyingValue()->stripPointerCasts());
8391     VPSingleDefRecipe *VectorPtr;
8392     if (Reverse) {
8393       // When folding the tail, we may compute an address that we would not
8394       // compute in the original scalar loop, and it may not be inbounds. Drop
8395       // Inbounds in that case.
8396       GEPNoWrapFlags Flags =
8397           (CM.foldTailByMasking() || !GEP || !GEP->isInBounds())
8398               ? GEPNoWrapFlags::none()
8399               : GEPNoWrapFlags::inBounds();
8400       VectorPtr = new VPReverseVectorPointerRecipe(
8401           Ptr, &Plan.getVF(), getLoadStoreType(I), Flags, I->getDebugLoc());
8402     } else {
8403       VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I),
8404                                             GEP ? GEP->getNoWrapFlags()
8405                                                 : GEPNoWrapFlags::none(),
8406                                             I->getDebugLoc());
8407     }
8408     Builder.getInsertBlock()->appendRecipe(VectorPtr);
8409     Ptr = VectorPtr;
8410   }
8411   if (LoadInst *Load = dyn_cast<LoadInst>(I))
8412     return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
8413                                  I->getDebugLoc());
8414 
8415   StoreInst *Store = cast<StoreInst>(I);
8416   return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive,
8417                                 Reverse, I->getDebugLoc());
8418 }
8419 
8420 /// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will also
8421 /// insert a recipe to expand the step for the induction recipe.
8422 static VPWidenIntOrFpInductionRecipe *
8423 createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc,
8424                             VPValue *Start, const InductionDescriptor &IndDesc,
8425                             VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop) {
8426   assert(IndDesc.getStartValue() ==
8427          Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
8428   assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) &&
8429          "step must be loop invariant");
8430 
8431   VPValue *Step =
8432       vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE);
8433   if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) {
8434     return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(),
8435                                              IndDesc, TruncI,
8436                                              TruncI->getDebugLoc());
8437   }
8438   assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
8439   return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(),
8440                                            IndDesc, Phi->getDebugLoc());
8441 }
8442 
8443 VPHeaderPHIRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI(
8444     PHINode *Phi, ArrayRef<VPValue *> Operands, VFRange &Range) {
8445 
8446   // Check if this is an integer or fp induction. If so, build the recipe that
8447   // produces its scalar and vector values.
8448   if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
8449     return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, Plan,
8450                                        *PSE.getSE(), *OrigLoop);
8451 
8452   // Check if this is pointer induction. If so, build the recipe for it.
8453   if (auto *II = Legal->getPointerInductionDescriptor(Phi)) {
8454     VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep(),
8455                                                            *PSE.getSE());
8456     return new VPWidenPointerInductionRecipe(
8457         Phi, Operands[0], Step, *II,
8458         LoopVectorizationPlanner::getDecisionAndClampRange(
8459             [&](ElementCount VF) {
8460               return CM.isScalarAfterVectorization(Phi, VF);
8461             },
8462             Range),
8463         Phi->getDebugLoc());
8464   }
8465   return nullptr;
8466 }
8467 
8468 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
8469     TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range) {
8470   // Optimize the special case where the source is a constant integer
8471   // induction variable. Notice that we can only optimize the 'trunc' case
8472   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
8473   // (c) other casts depend on pointer size.
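  // As a sketch (names illustrative), given a primary i64 induction
  //   %iv = phi i64 [ 0, %ph ], [ %iv.next, %latch ]
  //   %t  = trunc i64 %iv to i32
  // the trunc can be modeled directly as an i32 wide induction whenever the
  // cost model reports it as an optimizable IV truncate across the VF range.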
8474 
8475   // Determine whether \p K is a truncation based on an induction variable that
8476   // can be optimized.
8477   auto IsOptimizableIVTruncate =
8478       [&](Instruction *K) -> std::function<bool(ElementCount)> {
8479     return [=](ElementCount VF) -> bool {
8480       return CM.isOptimizableIVTruncate(K, VF);
8481     };
8482   };
8483 
8484   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8485           IsOptimizableIVTruncate(I), Range)) {
8486 
8487     auto *Phi = cast<PHINode>(I->getOperand(0));
8488     const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi);
8489     VPValue *Start = Plan.getOrAddLiveIn(II.getStartValue());
8490     return createWidenInductionRecipes(Phi, I, Start, II, Plan, *PSE.getSE(),
8491                                        *OrigLoop);
8492   }
8493   return nullptr;
8494 }
8495 
8496 VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi,
8497                                            ArrayRef<VPValue *> Operands) {
8498   unsigned NumIncoming = Phi->getNumIncomingValues();
8499 
8500   // We know that all PHIs in non-header blocks are converted into selects, so
8501   // we don't have to worry about the insertion order and we can just use the
8502   // builder. At this point we generate the predication tree. There may be
8503   // duplications since this is a simple recursive scan, but future
8504   // optimizations will clean it up.
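  // As an illustrative sketch, for a phi merging a predicated diamond,
  //   %p = phi [ %a, %then ], [ %b, %else ]
  // the blend is built with operands (%a, mask(then->bb), %b, mask(else->bb))
  // and is lowered to selects later.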
8505   SmallVector<VPValue *, 2> OperandsWithMask;
8506 
8507   for (unsigned In = 0; In < NumIncoming; In++) {
8508     OperandsWithMask.push_back(Operands[In]);
8509     VPValue *EdgeMask =
8510         getEdgeMask(Phi->getIncomingBlock(In), Phi->getParent());
8511     if (!EdgeMask) {
8512       assert(In == 0 && "Both null and non-null edge masks found");
8513       assert(all_equal(Operands) &&
8514              "Distinct incoming values with one having a full mask");
8515       break;
8516     }
8517     OperandsWithMask.push_back(EdgeMask);
8518   }
8519   return new VPBlendRecipe(Phi, OperandsWithMask);
8520 }
8521 
8522 VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
8523                                                    ArrayRef<VPValue *> Operands,
8524                                                    VFRange &Range) {
8525   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8526       [this, CI](ElementCount VF) {
8527         return CM.isScalarWithPredication(CI, VF);
8528       },
8529       Range);
8530 
8531   if (IsPredicated)
8532     return nullptr;
8533 
8534   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8535   if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8536              ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8537              ID == Intrinsic::pseudoprobe ||
8538              ID == Intrinsic::experimental_noalias_scope_decl))
8539     return nullptr;
8540 
8541   SmallVector<VPValue *, 4> Ops(Operands.take_front(CI->arg_size()));
8542 
8543   // Is it beneficial to perform an intrinsic call compared to a lib call?
8544   bool ShouldUseVectorIntrinsic =
8545       ID && LoopVectorizationPlanner::getDecisionAndClampRange(
8546                 [&](ElementCount VF) -> bool {
8547                   return CM.getCallWideningDecision(CI, VF).Kind ==
8548                          LoopVectorizationCostModel::CM_IntrinsicCall;
8549                 },
8550                 Range);
8551   if (ShouldUseVectorIntrinsic)
8552     return new VPWidenIntrinsicRecipe(*CI, ID, Ops, CI->getType(),
8553                                       CI->getDebugLoc());
8554 
8555   Function *Variant = nullptr;
8556   std::optional<unsigned> MaskPos;
8557   // Is it better to call a vectorized version of the function than to
8558   // scalarize the call?
8559   auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange(
8560       [&](ElementCount VF) -> bool {
8561         // The following case may be scalarized depending on the VF.
8562         // The flag shows whether we can use a usual Call for the vectorized
8563         // version of the instruction.
8564 
8565         // If we've found a variant at a previous VF, then stop looking. A
8566         // vectorized variant of a function expects input in a certain shape
8567         // -- basically the number of input registers, the number of lanes
8568         // per register, and whether there's a mask required.
8569         // We store a pointer to the variant in the VPWidenCallRecipe, so
8570         // once we have an appropriate variant it's only valid for that VF.
8571         // This will force a different vplan to be generated for each VF that
8572         // finds a valid variant.
8573         if (Variant)
8574           return false;
8575         LoopVectorizationCostModel::CallWideningDecision Decision =
8576             CM.getCallWideningDecision(CI, VF);
8577         if (Decision.Kind == LoopVectorizationCostModel::CM_VectorCall) {
8578           Variant = Decision.Variant;
8579           MaskPos = Decision.MaskPos;
8580           return true;
8581         }
8582 
8583         return false;
8584       },
8585       Range);
8586   if (ShouldUseVectorCall) {
8587     if (MaskPos.has_value()) {
8588       // We have 2 cases that would require a mask:
8589       //   1) The block needs to be predicated, either due to a conditional
8590       //      in the scalar loop or use of an active lane mask with
8591       //      tail-folding, and we use the appropriate mask for the block.
8592       //   2) No mask is required for the block, but the only available
8593       //      vector variant at this VF requires a mask, so we synthesize an
8594       //      all-true mask.
8595       VPValue *Mask = nullptr;
8596       if (Legal->isMaskRequired(CI))
8597         Mask = getBlockInMask(CI->getParent());
8598       else
8599         Mask = Plan.getOrAddLiveIn(
8600             ConstantInt::getTrue(IntegerType::getInt1Ty(CI->getContext())));
8601 
8602       Ops.insert(Ops.begin() + *MaskPos, Mask);
8603     }
8604 
8605     Ops.push_back(Operands.back());
8606     return new VPWidenCallRecipe(CI, Variant, Ops, CI->getDebugLoc());
8607   }
8608 
8609   return nullptr;
8610 }
8611 
8612 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
8613   assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8614          !isa<StoreInst>(I) && "Instruction should have been handled earlier");
8615   // Instruction should be widened, unless it is scalar after vectorization,
8616   // scalarization is profitable or it is predicated.
8617   auto WillScalarize = [this, I](ElementCount VF) -> bool {
8618     return CM.isScalarAfterVectorization(I, VF) ||
8619            CM.isProfitableToScalarize(I, VF) ||
8620            CM.isScalarWithPredication(I, VF);
8621   };
8622   return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
8623                                                              Range);
8624 }
8625 
8626 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
8627                                            ArrayRef<VPValue *> Operands,
8628                                            VPBasicBlock *VPBB) {
8629   switch (I->getOpcode()) {
8630   default:
8631     return nullptr;
8632   case Instruction::SDiv:
8633   case Instruction::UDiv:
8634   case Instruction::SRem:
8635   case Instruction::URem: {
8636     // If not provably safe, use a select to form a safe divisor before widening the
8637     // div/rem operation itself.  Otherwise fall through to general handling below.
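    // As an illustration, a predicated 'udiv %x, %d' is emitted roughly as
    //   %safe = select %block.mask, %d, 1
    //   %res  = udiv %x, %safe
    // so that lanes with a false mask bit divide by 1 and cannot trap.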
8638     if (CM.isPredicatedInst(I)) {
8639       SmallVector<VPValue *> Ops(Operands);
8640       VPValue *Mask = getBlockInMask(I->getParent());
8641       VPValue *One =
8642           Plan.getOrAddLiveIn(ConstantInt::get(I->getType(), 1u, false));
8643       auto *SafeRHS = Builder.createSelect(Mask, Ops[1], One, I->getDebugLoc());
8644       Ops[1] = SafeRHS;
8645       return new VPWidenRecipe(*I, make_range(Ops.begin(), Ops.end()));
8646     }
8647     [[fallthrough]];
8648   }
8649   case Instruction::Add:
8650   case Instruction::And:
8651   case Instruction::AShr:
8652   case Instruction::FAdd:
8653   case Instruction::FCmp:
8654   case Instruction::FDiv:
8655   case Instruction::FMul:
8656   case Instruction::FNeg:
8657   case Instruction::FRem:
8658   case Instruction::FSub:
8659   case Instruction::ICmp:
8660   case Instruction::LShr:
8661   case Instruction::Mul:
8662   case Instruction::Or:
8663   case Instruction::Select:
8664   case Instruction::Shl:
8665   case Instruction::Sub:
8666   case Instruction::Xor:
8667   case Instruction::Freeze:
8668     SmallVector<VPValue *> NewOps(Operands);
8669     if (Instruction::isBinaryOp(I->getOpcode())) {
8670       // The legacy cost model uses SCEV to check if some of the operands are
8671       // constants. To match the legacy cost model's behavior, use SCEV to try
8672       // to replace operands with constants.
8673       ScalarEvolution &SE = *PSE.getSE();
8674       auto GetConstantViaSCEV = [this, &SE](VPValue *Op) {
8675         Value *V = Op->getUnderlyingValue();
8676         if (isa<Constant>(V) || !SE.isSCEVable(V->getType()))
8677           return Op;
8678         auto *C = dyn_cast<SCEVConstant>(SE.getSCEV(V));
8679         if (!C)
8680           return Op;
8681         return Plan.getOrAddLiveIn(C->getValue());
8682       };
8683       // For Mul, the legacy cost model checks both operands.
8684       if (I->getOpcode() == Instruction::Mul)
8685         NewOps[0] = GetConstantViaSCEV(NewOps[0]);
8686       // For other binops, the legacy cost model only checks the second operand.
8687       NewOps[1] = GetConstantViaSCEV(NewOps[1]);
8688     }
8689     return new VPWidenRecipe(*I, make_range(NewOps.begin(), NewOps.end()));
8690   }
8691 }
8692 
8693 VPHistogramRecipe *
8694 VPRecipeBuilder::tryToWidenHistogram(const HistogramInfo *HI,
8695                                      ArrayRef<VPValue *> Operands) {
8696   // FIXME: Support other operations.
8697   unsigned Opcode = HI->Update->getOpcode();
8698   assert((Opcode == Instruction::Add || Opcode == Instruction::Sub) &&
8699          "Histogram update operation must be an Add or Sub");
8700 
8701   SmallVector<VPValue *, 3> HGramOps;
8702   // Bucket address.
8703   HGramOps.push_back(Operands[1]);
8704   // Increment value.
8705   HGramOps.push_back(getVPValueOrAddLiveIn(HI->Update->getOperand(1)));
8706 
8707   // In case of predicated execution (due to tail-folding, or conditional
8708   // execution, or both), pass the relevant mask.
8709   if (Legal->isMaskRequired(HI->Store))
8710     HGramOps.push_back(getBlockInMask(HI->Store->getParent()));
8711 
8712   return new VPHistogramRecipe(Opcode,
8713                                make_range(HGramOps.begin(), HGramOps.end()),
8714                                HI->Store->getDebugLoc());
8715 }
8716 
8717 void VPRecipeBuilder::fixHeaderPhis() {
8718   BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
8719   for (VPHeaderPHIRecipe *R : PhisToFix) {
8720     auto *PN = cast<PHINode>(R->getUnderlyingValue());
8721     VPRecipeBase *IncR =
8722         getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
8723     R->addOperand(IncR->getVPSingleValue());
8724   }
8725 }
8726 
8727 VPReplicateRecipe *VPRecipeBuilder::handleReplication(Instruction *I,
8728                                                       VFRange &Range) {
8729   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
8730       [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8731       Range);
8732 
8733   bool IsPredicated = CM.isPredicatedInst(I);
8734 
8735   // Even if the instruction is not marked as uniform, there are certain
8736   // intrinsic calls that can be effectively treated as such, so we check for
8737   // them here. Conservatively, we only do this for scalable vectors, since
8738   // for fixed-width VFs we can always fall back on full scalarization.
8739   if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
8740     switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
8741     case Intrinsic::assume:
8742     case Intrinsic::lifetime_start:
8743     case Intrinsic::lifetime_end:
8744       // For scalable vectors if one of the operands is variant then we still
8745       // want to mark as uniform, which will generate one instruction for just
8746       // the first lane of the vector. We can't scalarize the call in the same
8747       // way as for fixed-width vectors because we don't know how many lanes
8748       // there are.
8749       //
8750       // The reasons for doing it this way for scalable vectors are:
8751       //   1. For the assume intrinsic generating the instruction for the first
8752       //      lane is still better than not generating any at all. For
8753       //      example, the input may be a splat across all lanes.
8754       //   2. For the lifetime start/end intrinsics the pointer operand only
8755       //      does anything useful when the input comes from a stack object,
8756       //      which suggests it should always be uniform. For non-stack objects
8757       //      the effect is to poison the object, which still allows us to
8758       //      remove the call.
8759       IsUniform = true;
8760       break;
8761     default:
8762       break;
8763     }
8764   }
8765   VPValue *BlockInMask = nullptr;
8766   if (!IsPredicated) {
8767     // Finalize the recipe for Instr, first if it is not predicated.
8768     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8769   } else {
8770     LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8771     // Instructions marked for predication are replicated and a mask operand is
8772     // added initially. Masked replicate recipes will later be placed under an
8773     // if-then construct to prevent side-effects. Generate recipes to compute
8774     // the block mask for this region.
8775     BlockInMask = getBlockInMask(I->getParent());
8776   }
8777 
8778   // Note that there is some custom logic to mark some intrinsics as uniform
8779   // manually above for scalable vectors, which this assert needs to account for
8780   // as well.
8781   assert((Range.Start.isScalar() || !IsUniform || !IsPredicated ||
8782           (Range.Start.isScalable() && isa<IntrinsicInst>(I))) &&
8783          "Should not predicate a uniform recipe");
8784   auto *Recipe = new VPReplicateRecipe(I, mapToVPValues(I->operands()),
8785                                        IsUniform, BlockInMask);
8786   return Recipe;
8787 }
8788 
8789 /// Find all possible partial reductions in the loop and track all of those that
8790 /// are valid so recipes can be formed later.
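/// For example (illustrative only), a dot-product style update such as
///   %mul = mul i32 (sext i8 %a to i32), (sext i8 %b to i32)
///   %rdx = add i32 %phi, %mul
/// forms a chain (Reduction = %rdx, ExtendA/ExtendB = the sexts, BinOp = %mul)
/// that may later be lowered as a partial reduction with scale factor 32/8 = 4.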
8791 void VPRecipeBuilder::collectScaledReductions(VFRange &Range) {
8792   // Find all possible partial reductions.
8793   SmallVector<std::pair<PartialReductionChain, unsigned>, 1>
8794       PartialReductionChains;
8795   for (const auto &[Phi, RdxDesc] : Legal->getReductionVars())
8796     if (std::optional<std::pair<PartialReductionChain, unsigned>> Pair =
8797             getScaledReduction(Phi, RdxDesc, Range))
8798       PartialReductionChains.push_back(*Pair);
8799 
8800   // A partial reduction is invalid if any of its extends are used by
8801   // something that isn't another partial reduction. This is because the
8802   // extends are intended to be lowered along with the reduction itself.
8803 
8804   // Build up a set of partial reduction bin ops for efficient use checking.
8805   SmallSet<User *, 4> PartialReductionBinOps;
8806   for (const auto &[PartialRdx, _] : PartialReductionChains)
8807     PartialReductionBinOps.insert(PartialRdx.BinOp);
8808 
8809   auto ExtendIsOnlyUsedByPartialReductions =
8810       [&PartialReductionBinOps](Instruction *Extend) {
8811         return all_of(Extend->users(), [&](const User *U) {
8812           return PartialReductionBinOps.contains(U);
8813         });
8814       };
8815 
8816   // Check if each use of a chain's two extends is a partial reduction
8817   // and only add those that don't have non-partial reduction users.
8818   for (auto Pair : PartialReductionChains) {
8819     PartialReductionChain Chain = Pair.first;
8820     if (ExtendIsOnlyUsedByPartialReductions(Chain.ExtendA) &&
8821         ExtendIsOnlyUsedByPartialReductions(Chain.ExtendB))
8822       ScaledReductionExitInstrs.insert(std::make_pair(Chain.Reduction, Pair));
8823   }
8824 }
8825 
8826 std::optional<std::pair<PartialReductionChain, unsigned>>
8827 VPRecipeBuilder::getScaledReduction(PHINode *PHI,
8828                                     const RecurrenceDescriptor &Rdx,
8829                                     VFRange &Range) {
8830   // TODO: Allow scaling reductions when predicating. The select at
8831   // the end of the loop chooses between the phi value and most recent
8832   // reduction result, both of which have different VFs to the active lane
8833   // mask when scaling.
8834   if (CM.blockNeedsPredicationForAnyReason(Rdx.getLoopExitInstr()->getParent()))
8835     return std::nullopt;
8836 
8837   auto *Update = dyn_cast<BinaryOperator>(Rdx.getLoopExitInstr());
8838   if (!Update)
8839     return std::nullopt;
8840 
8841   Value *Op = Update->getOperand(0);
8842   Value *PhiOp = Update->getOperand(1);
8843   if (Op == PHI) {
8844     Op = Update->getOperand(1);
8845     PhiOp = Update->getOperand(0);
8846   }
8847   if (PhiOp != PHI)
8848     return std::nullopt;
8849 
8850   auto *BinOp = dyn_cast<BinaryOperator>(Op);
8851   if (!BinOp || !BinOp->hasOneUse())
8852     return std::nullopt;
8853 
8854   using namespace llvm::PatternMatch;
8855   Value *A, *B;
8856   if (!match(BinOp->getOperand(0), m_ZExtOrSExt(m_Value(A))) ||
8857       !match(BinOp->getOperand(1), m_ZExtOrSExt(m_Value(B))))
8858     return std::nullopt;
8859 
8860   Instruction *ExtA = cast<Instruction>(BinOp->getOperand(0));
8861   Instruction *ExtB = cast<Instruction>(BinOp->getOperand(1));
8862 
8863   TTI::PartialReductionExtendKind OpAExtend =
8864       TargetTransformInfo::getPartialReductionExtendKind(ExtA);
8865   TTI::PartialReductionExtendKind OpBExtend =
8866       TargetTransformInfo::getPartialReductionExtendKind(ExtB);
8867 
8868   PartialReductionChain Chain(Rdx.getLoopExitInstr(), ExtA, ExtB, BinOp);
8869 
8870   unsigned TargetScaleFactor =
8871       PHI->getType()->getPrimitiveSizeInBits().getKnownScalarFactor(
8872           A->getType()->getPrimitiveSizeInBits());
8873 
8874   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8875           [&](ElementCount VF) {
8876             InstructionCost Cost = TTI->getPartialReductionCost(
8877                 Update->getOpcode(), A->getType(), B->getType(), PHI->getType(),
8878                 VF, OpAExtend, OpBExtend,
8879                 std::make_optional(BinOp->getOpcode()));
8880             return Cost.isValid();
8881           },
8882           Range))
8883     return std::make_pair(Chain, TargetScaleFactor);
8884 
8885   return std::nullopt;
8886 }
8887 
8888 VPRecipeBase *
8889 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8890                                         ArrayRef<VPValue *> Operands,
8891                                         VFRange &Range, VPBasicBlock *VPBB) {
8892   // First, check for specific widening recipes that deal with inductions, Phi
8893   // nodes, calls and memory operations.
8894   VPRecipeBase *Recipe;
8895   if (auto *Phi = dyn_cast<PHINode>(Instr)) {
8896     if (Phi->getParent() != OrigLoop->getHeader())
8897       return tryToBlend(Phi, Operands);
8898 
8899     if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range)))
8900       return Recipe;
8901 
8902     VPHeaderPHIRecipe *PhiRecipe = nullptr;
8903     assert((Legal->isReductionVariable(Phi) ||
8904             Legal->isFixedOrderRecurrence(Phi)) &&
8905            "can only widen reductions and fixed-order recurrences here");
8906     VPValue *StartV = Operands[0];
8907     if (Legal->isReductionVariable(Phi)) {
8908       const RecurrenceDescriptor &RdxDesc =
8909           Legal->getReductionVars().find(Phi)->second;
8910       assert(RdxDesc.getRecurrenceStartValue() ==
8911              Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8912 
8913       // If the PHI is used by a partial reduction, set the scale factor.
8914       std::optional<std::pair<PartialReductionChain, unsigned>> Pair =
8915           getScaledReductionForInstr(RdxDesc.getLoopExitInstr());
8916       unsigned ScaleFactor = Pair ? Pair->second : 1;
8917       PhiRecipe = new VPReductionPHIRecipe(
8918           Phi, RdxDesc, *StartV, CM.isInLoopReduction(Phi),
8919           CM.useOrderedReductions(RdxDesc), ScaleFactor);
8920     } else {
8921       // TODO: Currently fixed-order recurrences are modeled as chains of
8922       // first-order recurrences. If there are no users of the intermediate
8923       // recurrences in the chain, the fixed order recurrence should be modeled
8924       // directly, enabling more efficient codegen.
8925       PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
8926     }
8927 
8928     PhisToFix.push_back(PhiRecipe);
8929     return PhiRecipe;
8930   }
8931 
8932   if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate(
8933                                     cast<TruncInst>(Instr), Operands, Range)))
8934     return Recipe;
8935 
8936   // All widen recipes below deal only with VF > 1.
8937   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8938           [&](ElementCount VF) { return VF.isScalar(); }, Range))
8939     return nullptr;
8940 
8941   if (auto *CI = dyn_cast<CallInst>(Instr))
8942     return tryToWidenCall(CI, Operands, Range);
8943 
8944   if (StoreInst *SI = dyn_cast<StoreInst>(Instr))
8945     if (auto HistInfo = Legal->getHistogramInfo(SI))
8946       return tryToWidenHistogram(*HistInfo, Operands);
8947 
8948   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8949     return tryToWidenMemory(Instr, Operands, Range);
8950 
8951   if (getScaledReductionForInstr(Instr))
8952     return tryToCreatePartialReduction(Instr, Operands);
8953 
8954   if (!shouldWiden(Instr, Range))
8955     return nullptr;
8956 
8957   if (auto *GEP = dyn_cast<GetElementPtrInst>(Instr))
8958     return new VPWidenGEPRecipe(GEP,
8959                                 make_range(Operands.begin(), Operands.end()));
8960 
8961   if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8962     return new VPWidenSelectRecipe(
8963         *SI, make_range(Operands.begin(), Operands.end()));
8964   }
8965 
8966   if (auto *CI = dyn_cast<CastInst>(Instr)) {
8967     return new VPWidenCastRecipe(CI->getOpcode(), Operands[0], CI->getType(),
8968                                  *CI);
8969   }
8970 
8971   return tryToWiden(Instr, Operands, VPBB);
8972 }
8973 
8974 VPRecipeBase *
8975 VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
8976                                              ArrayRef<VPValue *> Operands) {
8977   assert(Operands.size() == 2 &&
8978          "Unexpected number of operands for partial reduction");
8979 
8980   VPValue *BinOp = Operands[0];
8981   VPValue *Phi = Operands[1];
8982   if (isa<VPReductionPHIRecipe>(BinOp->getDefiningRecipe()))
8983     std::swap(BinOp, Phi);
8984 
8985   return new VPPartialReductionRecipe(Reduction->getOpcode(), BinOp, Phi,
8986                                       Reduction);
8987 }
8988 
8989 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8990                                                         ElementCount MaxVF) {
8991   assert(OrigLoop->isInnermost() && "Inner loop expected.");
8992 
8993   auto MaxVFTimes2 = MaxVF * 2;
8994   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
8995     VFRange SubRange = {VF, MaxVFTimes2};
8996     if (auto Plan = tryToBuildVPlanWithVPRecipes(SubRange)) {
8997       // Now optimize the initial VPlan.
8998       if (!Plan->hasVF(ElementCount::getFixed(1)))
8999         VPlanTransforms::truncateToMinimalBitwidths(*Plan,
9000                                                     CM.getMinimalBitwidths());
9001       VPlanTransforms::optimize(*Plan);
9002       // TODO: try to put it close to addActiveLaneMask().
9003       // Discard the plan if it is not EVL-compatible
9004       if (CM.foldTailWithEVL() && !VPlanTransforms::tryAddExplicitVectorLength(
9005                                       *Plan, CM.getMaxSafeElements()))
9006         break;
9007       assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
9008       VPlans.push_back(std::move(Plan));
9009     }
9010     VF = SubRange.End;
9011   }
9012 }
9013 
9014 // Add the necessary canonical IV and branch recipes required to control the
9015 // loop.
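// In sketch form, the recipes added here correspond to:
//   vector.body:
//     %index      = canonical-iv-phi [ 0, vector.ph ], [ %index.next, latch ]
//     ...
//     %index.next = add %index, VF * UF
//     branch-on-count %index.next, vector-trip-count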
9016 static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
9017                                   DebugLoc DL) {
9018   Value *StartIdx = ConstantInt::get(IdxTy, 0);
9019   auto *StartV = Plan.getOrAddLiveIn(StartIdx);
9020 
9021   // Add a VPCanonicalIVPHIRecipe starting at 0 to the header.
9022   auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
9023   VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
9024   VPBasicBlock *Header = TopRegion->getEntryBasicBlock();
9025   Header->insert(CanonicalIVPHI, Header->begin());
9026 
9027   VPBuilder Builder(TopRegion->getExitingBasicBlock());
9028   // Add a VPInstruction to increment the scalar canonical IV by VF * UF.
9029   auto *CanonicalIVIncrement = Builder.createOverflowingOp(
9030       Instruction::Add, {CanonicalIVPHI, &Plan.getVFxUF()}, {HasNUW, false}, DL,
9031       "index.next");
9032   CanonicalIVPHI->addOperand(CanonicalIVIncrement);
9033 
9034   // Add the BranchOnCount VPInstruction to the latch.
9035   Builder.createNaryOp(VPInstruction::BranchOnCount,
9036                        {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
9037 }
9038 
9039 /// Create and return a ResumePhi for \p WideIV, unless it is truncated. If the
9040 /// induction recipe is not canonical, creates a VPDerivedIVRecipe to compute
9041 /// the end value of the induction.
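/// Roughly (names illustrative), for a non-truncated wide induction this yields:
///   vector.ph: %end = derived-iv(start, vector-trip-count, step)   ; if needed
///   scalar.ph: %bc.resume.val = resume-phi [ %end, ... ], [ start, ... ]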
9042 static VPValue *addResumePhiRecipeForInduction(VPWidenInductionRecipe *WideIV,
9043                                                VPBuilder &VectorPHBuilder,
9044                                                VPBuilder &ScalarPHBuilder,
9045                                                VPTypeAnalysis &TypeInfo,
9046                                                VPValue *VectorTC) {
9047   auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
9048   // Truncated wide inductions resume from the last lane of their vector value
9049   // in the last vector iteration which is handled elsewhere.
9050   if (WideIntOrFp && WideIntOrFp->getTruncInst())
9051     return nullptr;
9052 
9053   VPValue *Start = WideIV->getStartValue();
9054   VPValue *Step = WideIV->getStepValue();
9055   const InductionDescriptor &ID = WideIV->getInductionDescriptor();
9056   VPValue *EndValue = VectorTC;
9057   if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
9058     EndValue = VectorPHBuilder.createDerivedIV(
9059         ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
9060         Start, VectorTC, Step);
9061   }
9062 
9063   // EndValue is derived from the vector trip count (which has the same type as
9064   // the widest induction) and thus may be wider than the induction here.
9065   Type *ScalarTypeOfWideIV = TypeInfo.inferScalarType(WideIV);
9066   if (ScalarTypeOfWideIV != TypeInfo.inferScalarType(EndValue)) {
9067     EndValue = VectorPHBuilder.createScalarCast(Instruction::Trunc, EndValue,
9068                                                 ScalarTypeOfWideIV,
9069                                                 WideIV->getDebugLoc());
9070   }
9071 
9072   auto *ResumePhiRecipe =
9073       ScalarPHBuilder.createNaryOp(VPInstruction::ResumePhi, {EndValue, Start},
9074                                    WideIV->getDebugLoc(), "bc.resume.val");
9075   return ResumePhiRecipe;
9076 }
9077 
9078 /// Create resume phis in the scalar preheader for first-order recurrences,
9079 /// reductions and inductions, and update the VPIRInstructions wrapping the
9080 /// original phis in the scalar header.
9081 static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan) {
9082   VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
9083   auto *ScalarPH = Plan.getScalarPreheader();
9084   auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getSinglePredecessor());
9085   VPBuilder VectorPHBuilder(
9086       cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSinglePredecessor()));
9087   VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
9088   VPBuilder ScalarPHBuilder(ScalarPH);
9089   VPValue *OneVPV = Plan.getOrAddLiveIn(
9090       ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1));
9091   for (VPRecipeBase &ScalarPhiR : *Plan.getScalarHeader()) {
9092     auto *ScalarPhiIRI = cast<VPIRInstruction>(&ScalarPhiR);
9093     auto *ScalarPhiI = dyn_cast<PHINode>(&ScalarPhiIRI->getInstruction());
9094     if (!ScalarPhiI)
9095       break;
9096 
9097     auto *VectorPhiR = cast<VPHeaderPHIRecipe>(Builder.getRecipe(ScalarPhiI));
9098     if (auto *WideIVR = dyn_cast<VPWidenInductionRecipe>(VectorPhiR)) {
9099       if (VPValue *ResumePhi = addResumePhiRecipeForInduction(
9100               WideIVR, VectorPHBuilder, ScalarPHBuilder, TypeInfo,
9101               &Plan.getVectorTripCount())) {
9102         ScalarPhiIRI->addOperand(ResumePhi);
9103         continue;
9104       }
9105       // TODO: Also handle truncated inductions here. Computing end-values
9106       // separately should be done as VPlan-to-VPlan optimization, after
9107       // legalizing all resume values to use the last lane from the loop.
9108       assert(cast<VPWidenIntOrFpInductionRecipe>(VectorPhiR)->getTruncInst() &&
9109              "should only skip truncated wide inductions");
9110       continue;
9111     }
9112 
9113     // The backedge value provides the value to resume coming out of a loop,
9114     // which for FORs is a vector whose last element needs to be extracted. The
9115     // start value provides the value if the loop is bypassed.
9116     bool IsFOR = isa<VPFirstOrderRecurrencePHIRecipe>(VectorPhiR);
9117     auto *ResumeFromVectorLoop = VectorPhiR->getBackedgeValue();
9118     if (IsFOR)
9119       ResumeFromVectorLoop = MiddleBuilder.createNaryOp(
9120           VPInstruction::ExtractFromEnd, {ResumeFromVectorLoop, OneVPV}, {},
9121           "vector.recur.extract");
9122     StringRef Name = IsFOR ? "scalar.recur.init" : "bc.merge.rdx";
9123     auto *ResumePhiR = ScalarPHBuilder.createNaryOp(
9124         VPInstruction::ResumePhi,
9125         {ResumeFromVectorLoop, VectorPhiR->getStartValue()}, {}, Name);
9126     ScalarPhiIRI->addOperand(ResumePhiR);
9127   }
9128 }
9129 
9130 /// Return true if \p VPV is an optimizable IV or IV use. That is, if \p VPV is
9131 /// either an untruncated wide induction, or if it increments a wide induction
9132 /// by its step.
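/// For example (illustrative): a widened induction %iv qualifies, and so does
/// its increment '%iv.next = add %iv, step'; a truncated IV or an add by a
/// value other than the step does not.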
9133 static bool isOptimizableIVOrUse(VPValue *VPV) {
9134   VPRecipeBase *Def = VPV->getDefiningRecipe();
9135   if (!Def)
9136     return false;
9137   auto *WideIV = dyn_cast<VPWidenInductionRecipe>(Def);
9138   if (WideIV) {
9139     // VPV itself is a wide induction, separately compute the end value for exit
9140     // users if it is not a truncated IV.
9141     return isa<VPWidenPointerInductionRecipe>(WideIV) ||
9142            !cast<VPWidenIntOrFpInductionRecipe>(WideIV)->getTruncInst();
9143   }
9144 
9145   // Check if VPV is an optimizable induction increment.
9146   if (Def->getNumOperands() != 2)
9147     return false;
9148   WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(0));
9149   if (!WideIV)
9150     WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(1));
9151   if (!WideIV)
9152     return false;
9153 
9154   using namespace VPlanPatternMatch;
9155   auto &ID = WideIV->getInductionDescriptor();
9156 
9157   // Check if VPV increments the induction by the induction step.
9158   VPValue *IVStep = WideIV->getStepValue();
9159   switch (ID.getInductionOpcode()) {
9160   case Instruction::Add:
9161     return match(VPV, m_c_Binary<Instruction::Add>(m_Specific(WideIV),
9162                                                    m_Specific(IVStep)));
9163   case Instruction::FAdd:
9164     return match(VPV, m_c_Binary<Instruction::FAdd>(m_Specific(WideIV),
9165                                                     m_Specific(IVStep)));
9166   case Instruction::FSub:
9167     return match(VPV, m_Binary<Instruction::FSub>(m_Specific(WideIV),
9168                                                   m_Specific(IVStep)));
9169   case Instruction::Sub: {
9170     // IVStep will be the negated step of the subtraction. Check if Step == -1 *
9171     // IVStep.
9172     VPValue *Step;
9173     if (!match(VPV, m_Binary<Instruction::Sub>(m_VPValue(), m_VPValue(Step))) ||
9174         !Step->isLiveIn() || !IVStep->isLiveIn())
9175       return false;
9176     auto *StepCI = dyn_cast<ConstantInt>(Step->getLiveInIRValue());
9177     auto *IVStepCI = dyn_cast<ConstantInt>(IVStep->getLiveInIRValue());
9178     return StepCI && IVStepCI &&
9179            StepCI->getValue() == (-1 * IVStepCI->getValue());
9180   }
9181   default:
9182     return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
9183            match(VPV, m_GetElementPtr(m_Specific(WideIV),
9184                                       m_Specific(WideIV->getStepValue())));
9185   }
9186   llvm_unreachable("should have been covered by switch above");
9187 }
9188 
9189 // Collect VPIRInstructions for phis in the exit blocks that are modeled
9190 // in VPlan and add the exiting VPValue as operand. Some exiting values are not
9191 // modeled explicitly yet and won't be included. Those are un-truncated
9192 // VPWidenIntOrFpInductionRecipe, VPWidenPointerInductionRecipe and induction
9193 // increments.
9194 static SetVector<VPIRInstruction *>
9195 collectUsersInExitBlocks(Loop *OrigLoop, VPRecipeBuilder &Builder,
9196                          VPlan &Plan) {
9197   auto *MiddleVPBB = Plan.getMiddleBlock();
9198   SetVector<VPIRInstruction *> ExitUsersToFix;
9199   for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
9200     for (VPRecipeBase &R : *ExitVPBB) {
9201       auto *ExitIRI = dyn_cast<VPIRInstruction>(&R);
9202       if (!ExitIRI)
9203         continue;
9204       auto *ExitPhi = dyn_cast<PHINode>(&ExitIRI->getInstruction());
9205       if (!ExitPhi)
9206         break;
9207       for (VPBlockBase *PredVPBB : ExitVPBB->getPredecessors()) {
9208         BasicBlock *ExitingBB = OrigLoop->getLoopLatch();
9209         if (PredVPBB != MiddleVPBB) {
9210           SmallVector<BasicBlock *> ExitingBlocks;
9211           OrigLoop->getExitingBlocks(ExitingBlocks);
9212           assert(ExitingBlocks.size() == 2 && "only support 2 exiting blocks");
9213           ExitingBB = ExitingBB == ExitingBlocks[0] ? ExitingBlocks[1]
9214                                                     : ExitingBlocks[0];
9215         }
9216         Value *IncomingValue = ExitPhi->getIncomingValueForBlock(ExitingBB);
9217         VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue);
9218         // Exit values for inductions are computed and updated outside of VPlan
9219         // and independent of induction recipes.
9220         // TODO: Compute induction exit values in VPlan.
9221         if (isOptimizableIVOrUse(V) &&
9222             ExitVPBB->getSinglePredecessor() == MiddleVPBB)
9223           continue;
9224         ExitUsersToFix.insert(ExitIRI);
9225         ExitIRI->addOperand(V);
9226       }
9227     }
9228   }
9229   return ExitUsersToFix;
9230 }
9231 
9232 // Add exit values to \p Plan. Extracts are added for each entry in \p
9233 // ExitUsersToFix if needed and their operands are updated. Returns true if all
9234 // exit users can be handled, otherwise returns false.
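// For example (a sketch): an LCSSA phi '%use = phi [ %v, %loop ]' fed by a
// loop-defined value gets its operand replaced with
//   extract-from-end(%v.widened, 1)
// in the middle block, i.e. the last lane of the final vector iteration.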
9235 static bool
9236 addUsersInExitBlocks(VPlan &Plan,
9237                      const SetVector<VPIRInstruction *> &ExitUsersToFix) {
9238   if (ExitUsersToFix.empty())
9239     return true;
9240 
9241   auto *MiddleVPBB = Plan.getMiddleBlock();
9242   VPBuilder B(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
9243 
9244   // Introduce extract for exiting values and update the VPIRInstructions
9245   // modeling the corresponding LCSSA phis.
9246   for (VPIRInstruction *ExitIRI : ExitUsersToFix) {
9247     for (const auto &[Idx, Op] : enumerate(ExitIRI->operands())) {
9248       // Pass live-in values used by exit phis directly through to their users
9249       // in the exit block.
9250       if (Op->isLiveIn())
9251         continue;
9252 
9253       // Currently only live-ins can be used by exit values from blocks not
9254       // exiting via the vector latch through to the middle block.
9255       if (ExitIRI->getParent()->getSinglePredecessor() != MiddleVPBB)
9256         return false;
9257 
9258       LLVMContext &Ctx = ExitIRI->getInstruction().getContext();
9259       VPValue *Ext = B.createNaryOp(VPInstruction::ExtractFromEnd,
9260                                     {Op, Plan.getOrAddLiveIn(ConstantInt::get(
9261                                              IntegerType::get(Ctx, 32), 1))});
9262       ExitIRI->setOperand(Idx, Ext);
9263     }
9264   }
9265   return true;
9266 }
9267 
9268 /// Handle users in the exit block for first-order recurrences in the original
9269 /// exit block. The penultimate value of recurrences is fed to their LCSSA phi
9270 /// users in the original exit block using the VPIRInstruction wrapping the
9271 /// LCSSA phi.
9272 static void addExitUsersForFirstOrderRecurrences(
9273     VPlan &Plan, SetVector<VPIRInstruction *> &ExitUsersToFix) {
9274   VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
9275   auto *ScalarPHVPBB = Plan.getScalarPreheader();
9276   auto *MiddleVPBB = Plan.getMiddleBlock();
9277   VPBuilder ScalarPHBuilder(ScalarPHVPBB);
9278   VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
9279   VPValue *TwoVPV = Plan.getOrAddLiveIn(
9280       ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 2));
9281 
9282   for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
9283     auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
9284     if (!FOR)
9285       continue;
9286 
9287     // This is the second phase of vectorizing first-order recurrences, creating
9288     // extract for users outside the loop. An overview of the transformation is
9289     // extracts for users outside the loop. An overview of the transformation is
9290     // the loop of the last a[i-1],
9291     //
9292     //   for (int i = 0; i < n; ++i) {
9293     //     t = a[i - 1];
9294     //     b[i] = a[i] - t;
9295     //   }
9296     //   use t;
9297     //
9298     // There is a first-order recurrence on "a". For this loop, the shorthand
9299     // scalar IR looks like:
9300     //
9301     //   scalar.ph:
9302     //     s.init = a[-1]
9303     //     br scalar.body
9304     //
9305     //   scalar.body:
9306     //     i = phi [0, scalar.ph], [i+1, scalar.body]
9307     //     s1 = phi [s.init, scalar.ph], [s2, scalar.body]
9308     //     s2 = a[i]
9309     //     b[i] = s2 - s1
9310     //     br cond, scalar.body, exit.block
9311     //
9312     //   exit.block:
9313     //     use = lcssa.phi [s1, scalar.body]
9314     //
9315     // In this example, s1 is a recurrence because its value depends on the
9316     // previous iteration. In the first phase of vectorization, we created a
9317     // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts
9318     // for users in the scalar preheader and exit block.
9319     //
9320     //   vector.ph:
9321     //     v_init = vector(..., ..., ..., a[-1])
9322     //     br vector.body
9323     //
9324     //   vector.body
9325     //     i = phi [0, vector.ph], [i+4, vector.body]
9326     //     v1 = phi [v_init, vector.ph], [v2, vector.body]
9327     //     v2 = a[i, i+1, i+2, i+3]
9328     //     b[i] = v2 - v1
9329     //     // Next, third phase will introduce v1' = splice(v1(3), v2(0, 1, 2))
9330     //     b[i, i+1, i+2, i+3] = v2 - v1
9331     //     br cond, vector.body, middle.block
9332     //
9333     //   middle.block:
9334     //     vector.recur.extract.for.phi = v2(2)
9335     //     vector.recur.extract = v2(3)
9336     //     br cond, scalar.ph, exit.block
9337     //
9338     //   scalar.ph:
9339     //     scalar.recur.init = phi [vector.recur.extract, middle.block],
9340     //                             [s.init, otherwise]
9341     //     br scalar.body
9342     //
9343     //   scalar.body:
9344     //     i = phi [0, scalar.ph], [i+1, scalar.body]
9345     //     s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body]
9346     //     s2 = a[i]
9347     //     b[i] = s2 - s1
9348     //     br cond, scalar.body, exit.block
9349     //
9350     //   exit.block:
9351     //     lo = lcssa.phi [s1, scalar.body],
9352     //                    [vector.recur.extract.for.phi, middle.block]
9353     //
9354     // Now update VPIRInstructions modeling LCSSA phis in the exit block.
9355     // Extract the penultimate value of the recurrence and use it as operand for
9356     // the VPIRInstruction modeling the phi.
9357     for (VPIRInstruction *ExitIRI : ExitUsersToFix) {
9358       if (ExitIRI->getOperand(0) != FOR)
9359         continue;
9360       VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
9361           VPInstruction::ExtractFromEnd, {FOR->getBackedgeValue(), TwoVPV}, {},
9362           "vector.recur.extract.for.phi");
9363       ExitIRI->setOperand(0, PenultimateElement);
9364       ExitUsersToFix.remove(ExitIRI);
9365     }
9366   }
9367 }
9368 
9369 VPlanPtr
9370 LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
9371 
9372   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
9373 
9374   // ---------------------------------------------------------------------------
9375   // Build initial VPlan: Scan the body of the loop in a topological order to
9376   // visit each basic block after having visited its predecessor basic blocks.
9377   // ---------------------------------------------------------------------------
9378 
9379   // Create initial VPlan skeleton, having a basic block for the pre-header
9380   // which contains SCEV expansions that need to happen before the CFG is
9381   // modified; a basic block for the vector pre-header, followed by a region for
9382   // the vector loop, followed by the middle basic block. The skeleton vector
9383   // loop region contains a header and latch basic blocks.
9384 
9385   bool RequiresScalarEpilogueCheck =
9386       LoopVectorizationPlanner::getDecisionAndClampRange(
9387           [this](ElementCount VF) {
9388             return !CM.requiresScalarEpilogue(VF.isVector());
9389           },
9390           Range);
9391   VPlanPtr Plan = VPlan::createInitialVPlan(Legal->getWidestInductionType(),
9392                                             PSE, RequiresScalarEpilogueCheck,
9393                                             CM.foldTailByMasking(), OrigLoop);
9394 
9395   // Don't use getDecisionAndClampRange here, because we don't know the UF,
9396   // so it is better to be conservative here rather than to split this up into
9397   // different VPlans.
9398   // TODO: Consider using getDecisionAndClampRange here to split up VPlans.
9399   bool IVUpdateMayOverflow = false;
9400   for (ElementCount VF : Range)
9401     IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF);
9402 
9403   DebugLoc DL = getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
9404   TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow);
9405   // Use NUW for the induction increment if we proved that it won't overflow in
9406   // the vector loop or when not folding the tail. In the latter case, we know
9407   // that the canonical induction increment will not overflow as the vector trip
9408   // count is >= increment and a multiple of the increment.
9409   bool HasNUW = !IVUpdateMayOverflow || Style == TailFoldingStyle::None;
9410   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL);
9411 
9412   VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE,
9413                                 Builder);
9414 
9415   // ---------------------------------------------------------------------------
9416   // Pre-construction: record ingredients whose recipes we'll need to further
9417   // process after constructing the initial VPlan.
9418   // ---------------------------------------------------------------------------
9419 
9420   // For each interleave group which is relevant for this (possibly trimmed)
9421   // Range, add it to the set of groups to be later applied to the VPlan and add
9422   // placeholders for its members' Recipes which we'll be replacing with a
9423   // single VPInterleaveRecipe.
9424   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
9425     auto ApplyIG = [IG, this](ElementCount VF) -> bool {
9426       bool Result = (VF.isVector() && // Query is illegal for VF == 1
9427                      CM.getWideningDecision(IG->getInsertPos(), VF) ==
9428                          LoopVectorizationCostModel::CM_Interleave);
9429       // For scalable vectors, the only interleave factor currently supported
9430       // is 2 since we require the (de)interleave2 intrinsics instead of
9431       // shufflevectors.
9432       assert((!Result || !VF.isScalable() || IG->getFactor() == 2) &&
9433              "Unsupported interleave factor for scalable vectors");
9434       return Result;
9435     };
9436     if (!getDecisionAndClampRange(ApplyIG, Range))
9437       continue;
9438     InterleaveGroups.insert(IG);
9439   }
9440 
9441   // ---------------------------------------------------------------------------
9442   // Construct recipes for the instructions in the loop
9443   // ---------------------------------------------------------------------------
9444 
9445   // Scan the body of the loop in a topological order to visit each basic block
9446   // after having visited its predecessor basic blocks.
9447   LoopBlocksDFS DFS(OrigLoop);
9448   DFS.perform(LI);
9449 
9450   VPBasicBlock *HeaderVPBB = Plan->getVectorLoopRegion()->getEntryBasicBlock();
9451   VPBasicBlock *VPBB = HeaderVPBB;
9452   BasicBlock *HeaderBB = OrigLoop->getHeader();
9453   bool NeedsMasks =
9454       CM.foldTailByMasking() ||
9455       any_of(OrigLoop->blocks(), [this, HeaderBB](BasicBlock *BB) {
9456         bool NeedsBlends = BB != HeaderBB && !BB->phis().empty();
9457         return Legal->blockNeedsPredication(BB) || NeedsBlends;
9458       });
9459 
9460   RecipeBuilder.collectScaledReductions(Range);
9461 
9462   auto *MiddleVPBB = Plan->getMiddleBlock();
9463   VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi();
9464   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
9465     // Relevant instructions from basic block BB will be grouped into VPRecipe
9466     // ingredients and fill a new VPBasicBlock.
9467     if (VPBB != HeaderVPBB)
9468       VPBB->setName(BB->getName());
9469     Builder.setInsertPoint(VPBB);
9470 
9471     if (VPBB == HeaderVPBB)
9472       RecipeBuilder.createHeaderMask();
9473     else if (NeedsMasks)
9474       RecipeBuilder.createBlockInMask(BB);
9475 
9476     // Introduce each ingredient into VPlan.
9477     // TODO: Model and preserve debug intrinsics in VPlan.
9478     for (Instruction &I : drop_end(BB->instructionsWithoutDebug(false))) {
9479       Instruction *Instr = &I;
9480       SmallVector<VPValue *, 4> Operands;
9481       auto *Phi = dyn_cast<PHINode>(Instr);
9482       if (Phi && Phi->getParent() == HeaderBB) {
9483         Operands.push_back(Plan->getOrAddLiveIn(
9484             Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
9485       } else {
9486         auto OpRange = RecipeBuilder.mapToVPValues(Instr->operands());
9487         Operands = {OpRange.begin(), OpRange.end()};
9488       }
9489 
9490       // The stores with invariant address inside the loop will be deleted, and
9491       // in the exit block, a uniform store recipe will be created for the final
9492       // invariant store of the reduction.
9493       StoreInst *SI;
9494       if ((SI = dyn_cast<StoreInst>(&I)) &&
9495           Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) {
9496         // Only create recipe for the final invariant store of the reduction.
9497         if (!Legal->isInvariantStoreOfReduction(SI))
9498           continue;
9499         auto *Recipe = new VPReplicateRecipe(
9500             SI, RecipeBuilder.mapToVPValues(Instr->operands()),
9501             true /* IsUniform */);
9502         Recipe->insertBefore(*MiddleVPBB, MBIP);
9503         continue;
9504       }
9505 
9506       VPRecipeBase *Recipe =
9507           RecipeBuilder.tryToCreateWidenRecipe(Instr, Operands, Range, VPBB);
9508       if (!Recipe)
9509         Recipe = RecipeBuilder.handleReplication(Instr, Range);
9510 
9511       RecipeBuilder.setRecipe(Instr, Recipe);
9512       if (isa<VPHeaderPHIRecipe>(Recipe)) {
9513         // VPHeaderPHIRecipes must be kept in the phi section of HeaderVPBB. In
9514         // the following cases, VPHeaderPHIRecipes may be created after non-phi
9515         // recipes and need to be moved to the phi section of HeaderVPBB:
9516         // * tail-folding (non-phi recipes computing the header mask are
9517         // introduced earlier than regular header phi recipes, and should appear
9518         // after them)
9519         // * Optimizing truncates to VPWidenIntOrFpInductionRecipe.
9520 
9521         assert((HeaderVPBB->getFirstNonPhi() == VPBB->end() ||
9522                 CM.foldTailByMasking() || isa<TruncInst>(Instr)) &&
9523                "unexpected recipe needs moving");
9524         Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
9525       } else
9526         VPBB->appendRecipe(Recipe);
9527     }
9528 
9529     VPBlockUtils::insertBlockAfter(Plan->createVPBasicBlock(""), VPBB);
9530     VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
9531   }
9532 
9533   // After here, VPBB should not be used.
9534   VPBB = nullptr;
9535 
9536   assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
9537          !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
9538          "entry block must be set to a VPRegionBlock having a non-empty entry "
9539          "VPBasicBlock");
9540   RecipeBuilder.fixHeaderPhis();
9541 
9542   // Update wide induction increments to use the same step as the corresponding
9543   // wide induction. This enables detecting induction increments directly in
9544   // VPlan and removes redundant splats.
9545   for (const auto &[Phi, ID] : Legal->getInductionVars()) {
9546     auto *IVInc = cast<Instruction>(
9547         Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
9548     if (IVInc->getOperand(0) != Phi || IVInc->getOpcode() != Instruction::Add)
9549       continue;
9550     VPWidenInductionRecipe *WideIV =
9551         cast<VPWidenInductionRecipe>(RecipeBuilder.getRecipe(Phi));
9552     VPRecipeBase *R = RecipeBuilder.getRecipe(IVInc);
9553     R->setOperand(1, WideIV->getStepValue());
9554   }
9555 
9556   if (auto *UncountableExitingBlock =
9557           Legal->getUncountableEarlyExitingBlock()) {
9558     VPlanTransforms::handleUncountableEarlyExit(
9559         *Plan, *PSE.getSE(), OrigLoop, UncountableExitingBlock, RecipeBuilder);
9560   }
9561   addScalarResumePhis(RecipeBuilder, *Plan);
9562   SetVector<VPIRInstruction *> ExitUsersToFix =
9563       collectUsersInExitBlocks(OrigLoop, RecipeBuilder, *Plan);
9564   addExitUsersForFirstOrderRecurrences(*Plan, ExitUsersToFix);
9565   if (!addUsersInExitBlocks(*Plan, ExitUsersToFix)) {
9566     reportVectorizationFailure(
9567         "Some exit values in loop with uncountable exit not supported yet",
9568         "UncountableEarlyExitLoopsUnsupportedExitValue", ORE, OrigLoop);
9569     return nullptr;
9570   }
9571 
9572   // ---------------------------------------------------------------------------
9573   // Transform initial VPlan: Apply previously taken decisions, in order, to
9574   // bring the VPlan to its final state.
9575   // ---------------------------------------------------------------------------
9576 
9577   // Adjust the recipes for any inloop reductions.
9578   adjustRecipesForReductions(Plan, RecipeBuilder, Range.Start);
9579 
9580   // Interleave memory: for each Interleave Group we marked earlier as relevant
9581   // for this VPlan, replace the Recipes widening its memory instructions with a
9582   // single VPInterleaveRecipe at its insertion point.
9583   VPlanTransforms::createInterleaveGroups(
9584       *Plan, InterleaveGroups, RecipeBuilder, CM.isScalarEpilogueAllowed());
9585 
9586   for (ElementCount VF : Range)
9587     Plan->addVF(VF);
9588   Plan->setName("Initial VPlan");
9589 
9590   // Replace VPValues for known constant strides guaranteed by predicate scalar
9591   // evolution.
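  // For example (illustration): if the runtime SCEV predicates guarantee that a
  // symbolic stride %s is 1, live-in uses of %s (and of sext/zext of %s) inside
  // the plan are rewritten below to use the constant 1 instead.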
9592   auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) {
9593     auto *R = cast<VPRecipeBase>(&U);
9594     return R->getParent()->getParent() ||
9595            R->getParent() ==
9596                Plan->getVectorLoopRegion()->getSinglePredecessor();
9597   };
9598   for (auto [_, Stride] : Legal->getLAI()->getSymbolicStrides()) {
9599     auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
9600     auto *ScevStride = dyn_cast<SCEVConstant>(PSE.getSCEV(StrideV));
9601     // Only handle constant strides for now.
9602     if (!ScevStride)
9603       continue;
9604 
9605     auto *CI = Plan->getOrAddLiveIn(
9606         ConstantInt::get(Stride->getType(), ScevStride->getAPInt()));
9607     if (VPValue *StrideVPV = Plan->getLiveIn(StrideV))
9608       StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
9609 
9610     // The versioned value may not be used in the loop directly but through a
9611     // sext/zext. Add new live-ins in those cases.
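         // For example (hypothetical values): if the versioned stride %s is an
         // i32 but the loop only uses %s.ext = sext i32 %s to i64, the live-in
         // for %s.ext is replaced with the sign-extended constant below.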
9612     for (Value *U : StrideV->users()) {
9613       if (!isa<SExtInst, ZExtInst>(U))
9614         continue;
9615       VPValue *StrideVPV = Plan->getLiveIn(U);
9616       if (!StrideVPV)
9617         continue;
9618       unsigned BW = U->getType()->getScalarSizeInBits();
9619       APInt C = isa<SExtInst>(U) ? ScevStride->getAPInt().sext(BW)
9620                                  : ScevStride->getAPInt().zext(BW);
9621       VPValue *CI = Plan->getOrAddLiveIn(ConstantInt::get(U->getType(), C));
9622       StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
9623     }
9624   }
9625 
9626   VPlanTransforms::dropPoisonGeneratingRecipes(*Plan, [this](BasicBlock *BB) {
9627     return Legal->blockNeedsPredication(BB);
9628   });
9629 
9630   // Sink users of fixed-order recurrence past the recipe defining the previous
9631   // value and introduce FirstOrderRecurrenceSplice VPInstructions.
9632   if (!VPlanTransforms::adjustFixedOrderRecurrences(*Plan, Builder))
9633     return nullptr;
9634 
9635   if (useActiveLaneMask(Style)) {
9636     // TODO: Move checks to VPlanTransforms::addActiveLaneMask once
9637     // TailFoldingStyle is visible there.
9638     bool ForControlFlow = useActiveLaneMaskForControlFlow(Style);
9639     bool WithoutRuntimeCheck =
9640         Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
9641     VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow,
9642                                        WithoutRuntimeCheck);
9643   }
9644 
9645   assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
9646   return Plan;
9647 }
9648 
9649 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
9650   // Outer loop handling: outer loops may require CFG and instruction level
9651   // transformations before even evaluating whether vectorization is profitable.
9652   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
9653   // the vectorization pipeline.
9654   assert(!OrigLoop->isInnermost());
9655   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
9656 
9657   // Create new empty VPlan
9658   auto Plan = VPlan::createInitialVPlan(Legal->getWidestInductionType(), PSE,
9659                                         true, false, OrigLoop);
9660 
9661   // Build hierarchical CFG
9662   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
9663   HCFGBuilder.buildHierarchicalCFG();
9664 
9665   for (ElementCount VF : Range)
9666     Plan->addVF(VF);
9667 
9668   VPlanTransforms::VPInstructionsToVPRecipes(
9669       Plan,
9670       [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
9671       *PSE.getSE(), *TLI);
9672 
9673   // Remove the existing terminator of the exiting block of the top-most region.
9674   // A BranchOnCount will be added instead when adding the canonical IV recipes.
9675   auto *Term =
9676       Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator();
9677   Term->eraseFromParent();
9678 
9679   // Tail folding is not supported for outer loops, so the induction increment
9680   // is guaranteed to not wrap.
9681   bool HasNUW = true;
9682   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW,
9683                         DebugLoc());
9684 
9685   // Collect mapping of IR header phis to header phi recipes, to be used in
9686   // addScalarResumePhis.
9687   VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE,
9688                                 Builder);
9689   for (auto &R : Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
9690     if (isa<VPCanonicalIVPHIRecipe>(&R))
9691       continue;
9692     auto *HeaderR = cast<VPHeaderPHIRecipe>(&R);
9693     RecipeBuilder.setRecipe(HeaderR->getUnderlyingInstr(), HeaderR);
9694   }
9695   addScalarResumePhis(RecipeBuilder, *Plan);
9696 
9697   assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
9698   return Plan;
9699 }
9700 
9701 // Adjust the recipes for reductions. For in-loop reductions the chain of
9702 // instructions leading from the loop exit instr to the phi need to be converted
9703 // instructions leading from the loop exit instr to the phi needs to be converted
9704 // reduction chain. For other reductions, a select is introduced between the phi
9705 // and users outside the vector region when folding the tail.
9706 //
9707 // A ComputeReductionResult recipe is added to the middle block, also for
9708 // in-loop reductions which compute their result in-loop, because generating
9709 // the subsequent bc.merge.rdx phi is driven by ComputeReductionResult recipes.
9710 //
9711 // Adjust AnyOf reductions; replace the reduction phi for the selected value
9712 // with a boolean reduction phi node to check if the condition is true in any
9713 // iteration. The final value is selected by the final ComputeReductionResult.
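     // As a rough illustration of the AnyOf case (hypothetical input), for
     //   %sel = select i1 %cmp, i32 42, i32 %red.phi
     // the phi becomes an i1 recurrence OR'ed with %cmp (negated if the phi had
     // been the true operand), and the final ComputeReductionResult materializes
     // the selected value.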
9714 void LoopVectorizationPlanner::adjustRecipesForReductions(
9715     VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) {
9716   using namespace VPlanPatternMatch;
9717   VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion();
9718   VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock();
9719   VPBasicBlock *MiddleVPBB = Plan->getMiddleBlock();
9720   SmallVector<VPRecipeBase *> ToDelete;
9721 
9722   for (VPRecipeBase &R : Header->phis()) {
9723     auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9724     if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered()))
9725       continue;
9726 
9727     const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
9728     RecurKind Kind = RdxDesc.getRecurrenceKind();
9729     assert(
9730         !RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) &&
9731         !RecurrenceDescriptor::isFindLastIVRecurrenceKind(Kind) &&
9732         "AnyOf and FindLast reductions are not allowed for in-loop reductions");
9733 
9734     // Collect the chain of "link" recipes for the reduction starting at PhiR.
9735     SetVector<VPSingleDefRecipe *> Worklist;
9736     Worklist.insert(PhiR);
9737     for (unsigned I = 0; I != Worklist.size(); ++I) {
9738       VPSingleDefRecipe *Cur = Worklist[I];
9739       for (VPUser *U : Cur->users()) {
9740         auto *UserRecipe = cast<VPSingleDefRecipe>(U);
9741         if (!UserRecipe->getParent()->getEnclosingLoopRegion()) {
9742           assert((UserRecipe->getParent() == MiddleVPBB ||
9743                   UserRecipe->getParent() == Plan->getScalarPreheader()) &&
9744                  "U must be either in the loop region, the middle block or the "
9745                  "scalar preheader.");
9746           continue;
9747         }
9748         Worklist.insert(UserRecipe);
9749       }
9750     }
9751 
9752     // Visit operation "Links" along the reduction chain top-down starting from
9753     // the phi until LoopExitValue. We keep track of the previous item
9754     // (PreviousLink) to tell which of the two operands of a Link will remain
9755     // scalar and which will be reduced. For minmax by select(cmp), Link will be
9756     // the select instruction. Blend recipes of in-loop reduction phis will
9757     // get folded to their non-phi operand, as the reduction recipe handles the
9758     // condition directly.
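         // As a rough illustration (hypothetical input), for a chain such as
         //   %red      = phi [ %start, %ph ], [ %red.next, %latch ]
         //   %red.next = add %red, %x
         // the worklist is {phi, add}; for the add link, %red (PreviousLink)
         // stays the scalar chain and %x becomes the VecOp of the reduction
         // recipe created below.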
9759     VPSingleDefRecipe *PreviousLink = PhiR; // Aka Worklist[0].
9760     for (VPSingleDefRecipe *CurrentLink : Worklist.getArrayRef().drop_front()) {
9761       Instruction *CurrentLinkI = CurrentLink->getUnderlyingInstr();
9762 
9763       // Index of the first operand which holds a non-mask vector operand.
9764       unsigned IndexOfFirstOperand;
9765       // Recognize a call to the llvm.fmuladd intrinsic.
9766       bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
9767       VPValue *VecOp;
9768       VPBasicBlock *LinkVPBB = CurrentLink->getParent();
9769       if (IsFMulAdd) {
9770         assert(
9771             RecurrenceDescriptor::isFMulAddIntrinsic(CurrentLinkI) &&
9772             "Expected instruction to be a call to the llvm.fmuladd intrinsic");
9773         assert(((MinVF.isScalar() && isa<VPReplicateRecipe>(CurrentLink)) ||
9774                 isa<VPWidenIntrinsicRecipe>(CurrentLink)) &&
9775                CurrentLink->getOperand(2) == PreviousLink &&
9776                "expected a call where the previous link is the added operand");
9777 
9778         // If the instruction is a call to the llvm.fmuladd intrinsic then we
9779         // need to create an fmul recipe (multiplying the first two operands of
9780         // the fmuladd together) to use as the vector operand for the fadd
9781         // reduction.
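             // E.g. (illustrative operands), for
             //   %s.next = call float @llvm.fmuladd(float %a, float %b, float %s)
             // the new fmul of %a and %b becomes VecOp, while %s remains the
             // scalar reduction (PreviousLink) operand.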
9782         VPInstruction *FMulRecipe = new VPInstruction(
9783             Instruction::FMul,
9784             {CurrentLink->getOperand(0), CurrentLink->getOperand(1)},
9785             CurrentLinkI->getFastMathFlags());
9786         LinkVPBB->insert(FMulRecipe, CurrentLink->getIterator());
9787         VecOp = FMulRecipe;
9788       } else {
9789         auto *Blend = dyn_cast<VPBlendRecipe>(CurrentLink);
9790         if (PhiR->isInLoop() && Blend) {
9791           assert(Blend->getNumIncomingValues() == 2 &&
9792                  "Blend must have 2 incoming values");
9793           if (Blend->getIncomingValue(0) == PhiR)
9794             Blend->replaceAllUsesWith(Blend->getIncomingValue(1));
9795           else {
9796             assert(Blend->getIncomingValue(1) == PhiR &&
9797                    "PhiR must be an operand of the blend");
9798             Blend->replaceAllUsesWith(Blend->getIncomingValue(0));
9799           }
9800           continue;
9801         }
9802 
9803         if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9804           if (isa<VPWidenRecipe>(CurrentLink)) {
9805             assert(isa<CmpInst>(CurrentLinkI) &&
9806                    "need to have the compare of the select");
9807             continue;
9808           }
9809           assert(isa<VPWidenSelectRecipe>(CurrentLink) &&
9810                  "must be a select recipe");
9811           IndexOfFirstOperand = 1;
9812         } else {
9813           assert((MinVF.isScalar() || isa<VPWidenRecipe>(CurrentLink)) &&
9814                  "Expected to replace a VPWidenSC");
9815           IndexOfFirstOperand = 0;
9816         }
9817         // Note that for non-commutative operands (cmp-selects), the semantics of
9818         // the cmp-select are captured in the recurrence kind.
9819         unsigned VecOpId =
9820             CurrentLink->getOperand(IndexOfFirstOperand) == PreviousLink
9821                 ? IndexOfFirstOperand + 1
9822                 : IndexOfFirstOperand;
9823         VecOp = CurrentLink->getOperand(VecOpId);
9824         assert(VecOp != PreviousLink &&
9825                CurrentLink->getOperand(CurrentLink->getNumOperands() - 1 -
9826                                        (VecOpId - IndexOfFirstOperand)) ==
9827                    PreviousLink &&
9828                "PreviousLink must be the operand other than VecOp");
9829       }
9830 
9831       BasicBlock *BB = CurrentLinkI->getParent();
9832       VPValue *CondOp = nullptr;
9833       if (CM.blockNeedsPredicationForAnyReason(BB))
9834         CondOp = RecipeBuilder.getBlockInMask(BB);
9835 
9836       auto *RedRecipe = new VPReductionRecipe(
9837           RdxDesc, CurrentLinkI, PreviousLink, VecOp, CondOp,
9838           CM.useOrderedReductions(RdxDesc), CurrentLinkI->getDebugLoc());
9839       // Append the recipe to the end of the VPBasicBlock because we need to
9840       // ensure that it comes after all of its inputs, including CondOp.
9841       // Delete CurrentLink as it will be invalid if its operand is replaced
9842       // with a reduction defined at the bottom of the block in the next link.
9843       LinkVPBB->appendRecipe(RedRecipe);
9844       CurrentLink->replaceAllUsesWith(RedRecipe);
9845       ToDelete.push_back(CurrentLink);
9846       PreviousLink = RedRecipe;
9847     }
9848   }
9849   VPBasicBlock *LatchVPBB = VectorLoopRegion->getExitingBasicBlock();
9850   Builder.setInsertPoint(&*LatchVPBB->begin());
9851   VPBasicBlock::iterator IP = MiddleVPBB->getFirstNonPhi();
9852   for (VPRecipeBase &R :
9853        Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
9854     VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9855     if (!PhiR)
9856       continue;
9857 
9858     const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
9859     // If tail is folded by masking, introduce selects between the phi
9860     // and the users outside the vector region of each reduction, at the
9861     // beginning of the dedicated latch block.
9862     auto *OrigExitingVPV = PhiR->getBackedgeValue();
9863     auto *NewExitingVPV = PhiR->getBackedgeValue();
9864     if (!PhiR->isInLoop() && CM.foldTailByMasking()) {
9865       VPValue *Cond = RecipeBuilder.getBlockInMask(OrigLoop->getHeader());
9866       assert(OrigExitingVPV->getDefiningRecipe()->getParent() != LatchVPBB &&
9867              "reduction recipe must be defined before latch");
9868       Type *PhiTy = PhiR->getOperand(0)->getLiveInIRValue()->getType();
9869       std::optional<FastMathFlags> FMFs =
9870           PhiTy->isFloatingPointTy()
9871               ? std::make_optional(RdxDesc.getFastMathFlags())
9872               : std::nullopt;
9873       NewExitingVPV =
9874           Builder.createSelect(Cond, OrigExitingVPV, PhiR, {}, "", FMFs);
9875       OrigExitingVPV->replaceUsesWithIf(NewExitingVPV, [](VPUser &U, unsigned) {
9876         return isa<VPInstruction>(&U) &&
9877                cast<VPInstruction>(&U)->getOpcode() ==
9878                    VPInstruction::ComputeReductionResult;
9879       });
9880       if (CM.usePredicatedReductionSelect(
9881               PhiR->getRecurrenceDescriptor().getOpcode(), PhiTy))
9882         PhiR->setOperand(1, NewExitingVPV);
9883     }
9884 
9885     // If the vector reduction can be performed in a smaller type, we truncate
9886     // then extend the loop exit value to enable InstCombine to evaluate the
9887     // entire expression in the smaller type.
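         // A rough example (assumed types): for an i32 reduction phi whose
         // recurrence type is i8, the exiting value is truncated to i8 and then
         // sign- or zero-extended back to i32, matching RdxDesc's signedness.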
9888     Type *PhiTy = PhiR->getStartValue()->getLiveInIRValue()->getType();
9889     if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType() &&
9890         !RecurrenceDescriptor::isAnyOfRecurrenceKind(
9891             RdxDesc.getRecurrenceKind())) {
9892       assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
9893       Type *RdxTy = RdxDesc.getRecurrenceType();
9894       auto *Trunc =
9895           new VPWidenCastRecipe(Instruction::Trunc, NewExitingVPV, RdxTy);
9896       auto *Extnd =
9897           RdxDesc.isSigned()
9898               ? new VPWidenCastRecipe(Instruction::SExt, Trunc, PhiTy)
9899               : new VPWidenCastRecipe(Instruction::ZExt, Trunc, PhiTy);
9900 
9901       Trunc->insertAfter(NewExitingVPV->getDefiningRecipe());
9902       Extnd->insertAfter(Trunc);
9903       if (PhiR->getOperand(1) == NewExitingVPV)
9904         PhiR->setOperand(1, Extnd->getVPSingleValue());
9905       NewExitingVPV = Extnd;
9906     }
9907 
9908     // We want code in the middle block to appear to execute on the location of
9909     // the scalar loop's latch terminator because: (a) it is all compiler
9910     // generated, (b) these instructions are always executed after evaluating
9911     // the latch conditional branch, and (c) other passes may add new
9912     // predecessors which terminate on this line. This is the easiest way to
9913     // ensure we don't accidentally cause an extra step back into the loop while
9914     // debugging.
9915     DebugLoc ExitDL = OrigLoop->getLoopLatch()->getTerminator()->getDebugLoc();
9916 
9917     // TODO: At the moment ComputeReductionResult also drives creation of the
9918     // bc.merge.rdx phi nodes, hence it needs to be created unconditionally here
9919     // even for in-loop reductions, until the reduction resume value handling is
9920     // also modeled in VPlan.
9921     auto *FinalReductionResult = new VPInstruction(
9922         VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL);
9923     // Update all users outside the vector region.
9924     OrigExitingVPV->replaceUsesWithIf(
9925         FinalReductionResult, [](VPUser &User, unsigned) {
9926           auto *Parent = cast<VPRecipeBase>(&User)->getParent();
9927           return Parent && !Parent->getParent();
9928         });
9929     FinalReductionResult->insertBefore(*MiddleVPBB, IP);
9930 
9931     // Adjust AnyOf reductions; replace the reduction phi for the selected value
9932     // with a boolean reduction phi node to check if the condition is true in
9933     // any iteration. The final value is selected by the final
9934     // ComputeReductionResult.
9935     if (RecurrenceDescriptor::isAnyOfRecurrenceKind(
9936             RdxDesc.getRecurrenceKind())) {
9937       auto *Select = cast<VPRecipeBase>(*find_if(PhiR->users(), [](VPUser *U) {
9938         return isa<VPWidenSelectRecipe>(U) ||
9939                (isa<VPReplicateRecipe>(U) &&
9940                 cast<VPReplicateRecipe>(U)->getUnderlyingInstr()->getOpcode() ==
9941                     Instruction::Select);
9942       }));
9943       VPValue *Cmp = Select->getOperand(0);
9944       // If the compare is checking the reduction PHI node, adjust it to check
9945       // the start value.
9946       if (VPRecipeBase *CmpR = Cmp->getDefiningRecipe()) {
9947         for (unsigned I = 0; I != CmpR->getNumOperands(); ++I)
9948           if (CmpR->getOperand(I) == PhiR)
9949             CmpR->setOperand(I, PhiR->getStartValue());
9950       }
9951       VPBuilder::InsertPointGuard Guard(Builder);
9952       Builder.setInsertPoint(Select);
9953 
9954       // If the true value of the select is the reduction phi, the new value is
9955       // selected if the negated condition is true in any iteration.
9956       if (Select->getOperand(1) == PhiR)
9957         Cmp = Builder.createNot(Cmp);
9958       VPValue *Or = Builder.createOr(PhiR, Cmp);
9959       Select->getVPSingleValue()->replaceAllUsesWith(Or);
9960       // Delete Select now that it has invalid types.
9961       ToDelete.push_back(Select);
9962 
9963       // Convert the reduction phi to operate on bools.
9964       PhiR->setOperand(0, Plan->getOrAddLiveIn(ConstantInt::getFalse(
9965                               OrigLoop->getHeader()->getContext())));
9966       continue;
9967     }
9968 
9969     if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(
9970             RdxDesc.getRecurrenceKind())) {
9971       // Adjust the start value for FindLastIV recurrences to use the sentinel
9972       // value after generating the ResumePhi recipe, which uses the original
9973       // start value.
9974       PhiR->setOperand(0, Plan->getOrAddLiveIn(RdxDesc.getSentinelValue()));
9975     }
9976   }
9977 
9978   VPlanTransforms::clearReductionWrapFlags(*Plan);
9979   for (VPRecipeBase *R : ToDelete)
9980     R->eraseFromParent();
9981 }
9982 
9983 void VPDerivedIVRecipe::execute(VPTransformState &State) {
9984   assert(!State.Lane && "VPDerivedIVRecipe being replicated.");
9985 
9986   // Fast-math-flags propagate from the original induction instruction.
9987   IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
9988   if (FPBinOp)
9989     State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags());
9990 
9991   Value *Step = State.get(getStepValue(), VPLane(0));
9992   Value *Index = State.get(getOperand(1), VPLane(0));
9993   Value *DerivedIV = emitTransformedIndex(
9994       State.Builder, Index, getStartValue()->getLiveInIRValue(), Step, Kind,
9995       cast_if_present<BinaryOperator>(FPBinOp));
9996   DerivedIV->setName(Name);
9997   // If index is the vector trip count, the concrete value will only be set in
9998   // prepareToExecute, leading to missed simplifications, e.g. if it is 0.
9999   // TODO: Remove the special case for the vector trip count once it is computed
10000   // in VPlan and can be used during VPlan simplification.
10001   assert((DerivedIV != Index ||
10002           getOperand(1) == &getParent()->getPlan()->getVectorTripCount()) &&
10003          "IV didn't need transforming?");
10004   State.set(this, DerivedIV, VPLane(0));
10005 }
10006 
10007 void VPReplicateRecipe::execute(VPTransformState &State) {
10008   Instruction *UI = getUnderlyingInstr();
10009   if (State.Lane) { // Generate a single instance.
10010     assert((State.VF.isScalar() || !isUniform()) &&
10011            "uniform recipe shouldn't be predicated");
10012     assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
10013     State.ILV->scalarizeInstruction(UI, this, *State.Lane, State);
10014     // Insert scalar instance packing it into a vector.
10015     if (State.VF.isVector() && shouldPack()) {
10016       // If we're constructing lane 0, initialize to start from poison.
10017       if (State.Lane->isFirstLane()) {
10018         assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
10019         Value *Poison = PoisonValue::get(
10020             VectorType::get(UI->getType(), State.VF));
10021         State.set(this, Poison);
10022       }
10023       State.packScalarIntoVectorValue(this, *State.Lane);
10024     }
10025     return;
10026   }
10027 
10028   if (IsUniform) {
10029     // Uniform within VL means we need to generate lane 0.
10030     State.ILV->scalarizeInstruction(UI, this, VPLane(0), State);
10031     return;
10032   }
10033 
10034   // A store of a loop varying value to a uniform address only needs the last
10035   // copy of the store.
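        // E.g. (sketch only), for 'store i32 %val, ptr %p' with a uniform
        // address %p, only the last lane's instance is emitted below.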
10036   if (isa<StoreInst>(UI) &&
10037       vputils::isUniformAfterVectorization(getOperand(1))) {
10038     auto Lane = VPLane::getLastLaneForVF(State.VF);
10039     State.ILV->scalarizeInstruction(UI, this, VPLane(Lane), State);
10040     return;
10041   }
10042 
10043   // Generate scalar instances for all VF lanes.
10044   assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
10045   const unsigned EndLane = State.VF.getKnownMinValue();
10046   for (unsigned Lane = 0; Lane < EndLane; ++Lane)
10047     State.ILV->scalarizeInstruction(UI, this, VPLane(Lane), State);
10048 }
10049 
10050 // Determine how to lower the scalar epilogue, which depends on 1) optimising
10051 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
10052 // predication, and 4) a TTI hook that analyses whether the loop is suitable
10053 // for predication.
10054 static ScalarEpilogueLowering getScalarEpilogueLowering(
10055     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
10056     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
10057     LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI) {
10058   // 1) OptSize takes precedence over all other options, i.e. if this is set,
10059   // don't look at hints or options, and don't request a scalar epilogue.
10060   // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
10061   // LoopAccessInfo (due to code dependency and not being able to reliably get
10062   // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
10063   // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
10064   // versioning when the vectorization is forced, unlike hasOptSize. So revert
10065   // back to the old way and vectorize with versioning when forced. See D81345.)
10066   if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
10067                                                       PGSOQueryType::IRPass) &&
10068                           Hints.getForce() != LoopVectorizeHints::FK_Enabled))
10069     return CM_ScalarEpilogueNotAllowedOptSize;
10070 
10071   // 2) If set, obey the directives
10072   if (PreferPredicateOverEpilogue.getNumOccurrences()) {
10073     switch (PreferPredicateOverEpilogue) {
10074     case PreferPredicateTy::ScalarEpilogue:
10075       return CM_ScalarEpilogueAllowed;
10076     case PreferPredicateTy::PredicateElseScalarEpilogue:
10077       return CM_ScalarEpilogueNotNeededUsePredicate;
10078     case PreferPredicateTy::PredicateOrDontVectorize:
10079       return CM_ScalarEpilogueNotAllowedUsePredicate;
10080     };
10081   }
10082 
10083   // 3) If set, obey the hints
10084   switch (Hints.getPredicate()) {
10085   case LoopVectorizeHints::FK_Enabled:
10086     return CM_ScalarEpilogueNotNeededUsePredicate;
10087   case LoopVectorizeHints::FK_Disabled:
10088     return CM_ScalarEpilogueAllowed;
10089   };
10090 
10091   // 4) if the TTI hook indicates this is profitable, request predication.
10092   TailFoldingInfo TFI(TLI, &LVL, IAI);
10093   if (TTI->preferPredicateOverEpilogue(&TFI))
10094     return CM_ScalarEpilogueNotNeededUsePredicate;
10095 
10096   return CM_ScalarEpilogueAllowed;
10097 }
10098 
10099 // Process the loop in the VPlan-native vectorization path. This path builds
10100 // VPlan upfront in the vectorization pipeline, which allows applying
10101 // VPlan-to-VPlan transformations from the very beginning without modifying the
10102 // input LLVM IR.
10103 static bool processLoopInVPlanNativePath(
10104     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
10105     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
10106     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
10107     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
10108     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
10109     LoopVectorizationRequirements &Requirements) {
10110 
10111   if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
10112     LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
10113     return false;
10114   }
10115   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
10116   Function *F = L->getHeader()->getParent();
10117   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
10118 
10119   ScalarEpilogueLowering SEL =
10120       getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, *LVL, &IAI);
10121 
10122   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
10123                                 &Hints, IAI);
10124   // Use the planner for outer loop vectorization.
10125   // TODO: CM is not used at this point inside the planner. Turn CM into an
10126   // optional argument if we don't need it in the future.
10127   LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, IAI, PSE, Hints,
10128                                ORE);
10129 
10130   // Get user vectorization factor.
10131   ElementCount UserVF = Hints.getWidth();
10132 
10133   CM.collectElementTypesForWidening();
10134 
10135   // Plan how to best vectorize, return the best VF and its cost.
10136   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
10137 
10138   // If we are stress testing VPlan builds, do not attempt to generate vector
10139   // code. Masked vector code generation support will follow soon.
10140   // Also, do not attempt to vectorize if no vector code will be produced.
10141   if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF)
10142     return false;
10143 
10144   VPlan &BestPlan = LVP.getPlanFor(VF.Width);
10145 
10146   {
10147     bool AddBranchWeights =
10148         hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
10149     GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(),
10150                              AddBranchWeights);
10151     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
10152                            VF.Width, 1, LVL, &CM, BFI, PSI, Checks, BestPlan);
10153     LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
10154                       << L->getHeader()->getParent()->getName() << "\"\n");
10155     LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false);
10156   }
10157 
10158   reportVectorization(ORE, L, VF, 1);
10159 
10160   // Mark the loop as already vectorized to avoid vectorizing again.
10161   Hints.setAlreadyVectorized();
10162   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10163   return true;
10164 }
10165 
10166 // Emit a remark if there are stores to floats that required a floating point
10167 // extension. If the vectorized loop was generated with floating point there
10168 // will be a performance penalty from the conversion overhead and the change in
10169 // the vector width.
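      // A sketch of the kind of pattern flagged (hypothetical IR): a float input
      // widened via 'fpext float %a to double', combined in double precision and
      // eventually truncated and stored as float, forces up/down casts between
      // differently sized vector types in the vectorized loop.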
10170 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
10171   SmallVector<Instruction *, 4> Worklist;
10172   for (BasicBlock *BB : L->getBlocks()) {
10173     for (Instruction &Inst : *BB) {
10174       if (auto *S = dyn_cast<StoreInst>(&Inst)) {
10175         if (S->getValueOperand()->getType()->isFloatTy())
10176           Worklist.push_back(S);
10177       }
10178     }
10179   }
10180 
10181   // Traverse the floating point stores upwards, searching for floating point
10182   // conversions.
10183   SmallPtrSet<const Instruction *, 4> Visited;
10184   SmallPtrSet<const Instruction *, 4> EmittedRemark;
10185   while (!Worklist.empty()) {
10186     auto *I = Worklist.pop_back_val();
10187     if (!L->contains(I))
10188       continue;
10189     if (!Visited.insert(I).second)
10190       continue;
10191 
10192     // Emit a remark if the floating point store required a floating
10193     // point conversion.
10194     // TODO: More work could be done to identify the root cause such as a
10195     // constant or a function return type and point the user to it.
10196     if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
10197       ORE->emit([&]() {
10198         return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
10199                                           I->getDebugLoc(), L->getHeader())
10200                << "floating point conversion changes vector width. "
10201                << "Mixed floating point precision requires an up/down "
10202                << "cast that will negatively impact performance.";
10203       });
10204 
10205     for (Use &Op : I->operands())
10206       if (auto *OpI = dyn_cast<Instruction>(Op))
10207         Worklist.push_back(OpI);
10208   }
10209 }
10210 
10211 static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
10212                                        VectorizationFactor &VF, Loop *L,
10213                                        const TargetTransformInfo &TTI,
10214                                        PredicatedScalarEvolution &PSE,
10215                                        ScalarEpilogueLowering SEL) {
10216   InstructionCost CheckCost = Checks.getCost();
10217   if (!CheckCost.isValid())
10218     return false;
10219 
10220   // When only interleaving, the scalar and vector cost will be equal, which in
10221   // turn would lead to a divide by 0. Fall back to the hard threshold.
10222   if (VF.Width.isScalar()) {
10223     if (CheckCost > VectorizeMemoryCheckThreshold) {
10224       LLVM_DEBUG(
10225           dbgs()
10226           << "LV: Interleaving only is not profitable due to runtime checks\n");
10227       return false;
10228     }
10229     return true;
10230   }
10231 
10232   // The scalar cost should only be 0 when vectorizing with a user specified
        // VF/IC. In those cases, runtime checks should always be generated.
10233   uint64_t ScalarC = *VF.ScalarCost.getValue();
10234   if (ScalarC == 0)
10235     return true;
10236 
10237   // First, compute the minimum iteration count required so that the vector
10238   // loop outperforms the scalar loop.
10239   //  The total cost of the scalar loop is
10240   //   ScalarC * TC
10241   //  where
10242   //  * TC is the actual trip count of the loop.
10243   //  * ScalarC is the cost of a single scalar iteration.
10244   //
10245   //  The total cost of the vector loop is
10246   //    RtC + VecC * (TC / VF) + EpiC
10247   //  where
10248   //  * RtC is the cost of the generated runtime checks
10249   //  * VecC is the cost of a single vector iteration.
10250   //  * TC is the actual trip count of the loop
10251   //  * VF is the vectorization factor
10252   //  * EpiC is the cost of the generated epilogue, including the cost
10253   //    of the remaining scalar operations.
10254   //
10255   // Vectorization is profitable once the total vector cost is less than the
10256   // total scalar cost:
10257   //   RtC + VecC * (TC / VF) + EpiC <  ScalarC * TC
10258   //
10259   // Now we can compute the minimum required trip count TC as
10260   //   VF * (RtC + EpiC) / (ScalarC * VF - VecC) < TC
10261   //
10262   // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
10263   // the computations are performed on doubles, not integers and the result
10264   // is rounded up, hence we get an upper estimate of the TC.
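        // A worked example with made-up costs: for ScalarC = 4, VecC = 6,
        // RtC = 32 and an estimated VF of 4, Div = 4 * 4 - 6 = 10 and
        // MinTC1 = ceil(32 * 4 / 10) = 13, i.e. at least 13 iterations are
        // needed before the vector loop amortizes the runtime checks.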
10265   unsigned IntVF = getEstimatedRuntimeVF(L, TTI, VF.Width);
10266   uint64_t RtC = *CheckCost.getValue();
10267   uint64_t Div = ScalarC * IntVF - *VF.Cost.getValue();
10268   uint64_t MinTC1 = Div == 0 ? 0 : divideCeil(RtC * IntVF, Div);
10269 
10270   // Second, compute a minimum iteration count so that the cost of the
10271   // runtime checks is only a fraction of the total scalar loop cost. This
10272   // adds a loop-dependent bound on the overhead incurred if the runtime
10273   // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC
10274   // * TC. To bound the runtime check to be a fraction 1/X of the scalar
10275   // cost, compute
10276   //   RtC < ScalarC * TC * (1 / X)  ==>  RtC * X / ScalarC < TC
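        // Continuing the made-up numbers above with X = 10 (the factor used
        // below), MinTC2 = ceil(32 * 10 / 4) = 80, so in that example this
        // bound dominates MinTC1.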
10277   uint64_t MinTC2 = divideCeil(RtC * 10, ScalarC);
10278 
10279   // Now pick the larger minimum. If it is not a multiple of VF and a scalar
10280   // epilogue is allowed, choose the next closest multiple of VF. This should
10281   // partly compensate for ignoring the epilogue cost.
10282   uint64_t MinTC = std::max(MinTC1, MinTC2);
10283   if (SEL == CM_ScalarEpilogueAllowed)
10284     MinTC = alignTo(MinTC, IntVF);
10285   VF.MinProfitableTripCount = ElementCount::getFixed(MinTC);
10286 
10287   LLVM_DEBUG(
10288       dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
10289              << VF.MinProfitableTripCount << "\n");
10290 
10291   // Skip vectorization if the expected trip count is less than the minimum
10292   // required trip count.
10293   if (auto ExpectedTC = getSmallBestKnownTC(PSE, L)) {
10294     if (ElementCount::isKnownLT(ElementCount::getFixed(*ExpectedTC),
10295                                 VF.MinProfitableTripCount)) {
10296       LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
10297                            "trip count < minimum profitable VF ("
10298                         << *ExpectedTC << " < " << VF.MinProfitableTripCount
10299                         << ")\n");
10300 
10301       return false;
10302     }
10303   }
10304   return true;
10305 }
10306 
10307 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
10308     : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
10309                                !EnableLoopInterleaving),
10310       VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
10311                               !EnableLoopVectorization) {}
10312 
10313 /// Prepare \p MainPlan for vectorizing the main vector loop during epilogue
10314 /// vectorization. Remove ResumePhis from \p MainPlan for inductions that
10315 /// don't have a corresponding wide induction in \p EpiPlan.
10316 static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) {
10317   // Collect PHI nodes of widened phis in the VPlan for the epilogue. Those
10318   // will need their resume-values computed in the main vector loop. Others
10319   // can be removed from the main VPlan.
10320   SmallPtrSet<PHINode *, 2> EpiWidenedPhis;
10321   for (VPRecipeBase &R :
10322        EpiPlan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
10323     if (isa<VPCanonicalIVPHIRecipe>(&R))
10324       continue;
10325     EpiWidenedPhis.insert(
10326         cast<PHINode>(R.getVPSingleValue()->getUnderlyingValue()));
10327   }
10328   for (VPRecipeBase &R : make_early_inc_range(
10329            *cast<VPIRBasicBlock>(MainPlan.getScalarHeader()))) {
10330     auto *VPIRInst = cast<VPIRInstruction>(&R);
10331     auto *IRI = dyn_cast<PHINode>(&VPIRInst->getInstruction());
10332     if (!IRI)
10333       break;
10334     if (EpiWidenedPhis.contains(IRI))
10335       continue;
10336     // There is no corresponding wide induction in the epilogue plan that would
10337     // need a resume value. Remove the VPIRInst wrapping the scalar header phi
10338     // together with the corresponding ResumePhi. The resume values for the
10339     // scalar loop will be created during execution of EpiPlan.
10340     VPRecipeBase *ResumePhi = VPIRInst->getOperand(0)->getDefiningRecipe();
10341     VPIRInst->eraseFromParent();
10342     ResumePhi->eraseFromParent();
10343   }
10344   VPlanTransforms::removeDeadRecipes(MainPlan);
10345 
10346   using namespace VPlanPatternMatch;
10347   VPBasicBlock *MainScalarPH = MainPlan.getScalarPreheader();
10348   VPValue *VectorTC = &MainPlan.getVectorTripCount();
10349   // If there is a suitable resume value for the canonical induction in the
10350   // scalar (which will become vector) epilogue loop we are done. Otherwise
10351   // create it below.
10352   if (any_of(*MainScalarPH, [VectorTC](VPRecipeBase &R) {
10353         return match(&R, m_VPInstruction<VPInstruction::ResumePhi>(
10354                              m_Specific(VectorTC), m_SpecificInt(0)));
10355       }))
10356     return;
10357   VPBuilder ScalarPHBuilder(MainScalarPH, MainScalarPH->begin());
10358   ScalarPHBuilder.createNaryOp(
10359       VPInstruction::ResumePhi,
10360       {VectorTC, MainPlan.getCanonicalIV()->getStartValue()}, {},
10361       "vec.epilog.resume.val");
10362 }
10363 
10364 /// Prepare \p Plan for vectorizing the epilogue loop. That is, re-use expanded
10365 /// SCEVs from \p ExpandedSCEVs and set resume values for header recipes.
10366 static void
10367 preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L,
10368                                  const SCEV2ValueTy &ExpandedSCEVs,
10369                                  const EpilogueLoopVectorizationInfo &EPI) {
10370   VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
10371   VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
10372   Header->setName("vec.epilog.vector.body");
10373 
10374   // Re-use the trip count and steps expanded for the main loop, as
10375   // skeleton creation needs them as values that dominate both the scalar
10376   // and vector epilogue loops.
10377   // TODO: This is a workaround needed for epilogue vectorization and it
10378   // should be removed once induction resume value creation is done
10379   // directly in VPlan.
10380   for (auto &R : make_early_inc_range(*Plan.getEntry())) {
10381     auto *ExpandR = dyn_cast<VPExpandSCEVRecipe>(&R);
10382     if (!ExpandR)
10383       continue;
10384     auto *ExpandedVal =
10385         Plan.getOrAddLiveIn(ExpandedSCEVs.find(ExpandR->getSCEV())->second);
10386     ExpandR->replaceAllUsesWith(ExpandedVal);
10387     if (Plan.getTripCount() == ExpandR)
10388       Plan.resetTripCount(ExpandedVal);
10389     ExpandR->eraseFromParent();
10390   }
10391 
10392   // Ensure that the start values for all header phi recipes are updated before
10393   // vectorizing the epilogue loop.
10394   for (VPRecipeBase &R : Header->phis()) {
10395     if (auto *IV = dyn_cast<VPCanonicalIVPHIRecipe>(&R)) {
10396       // When vectorizing the epilogue loop, the canonical induction start
10397       // value needs to be changed from zero to the value after the main
10398       // vector loop. Find the resume value created during execution of the main
10399       // VPlan.
10400       // FIXME: Improve modeling for canonical IV start values in the epilogue
10401       // loop.
10402       BasicBlock *MainMiddle = find_singleton<BasicBlock>(
10403           predecessors(L->getLoopPreheader()),
10404           [&EPI](BasicBlock *BB, bool) -> BasicBlock * {
10405             if (BB != EPI.MainLoopIterationCountCheck &&
10406                 BB != EPI.EpilogueIterationCountCheck &&
10407                 BB != EPI.SCEVSafetyCheck && BB != EPI.MemSafetyCheck)
10408               return BB;
10409             return nullptr;
10410           });
10411       using namespace llvm::PatternMatch;
10412       Type *IdxTy = IV->getScalarType();
10413       PHINode *EPResumeVal = find_singleton<PHINode>(
10414           L->getLoopPreheader()->phis(),
10415           [&EPI, IdxTy, MainMiddle](PHINode &P, bool) -> PHINode * {
10416             if (P.getType() == IdxTy &&
10417                 P.getIncomingValueForBlock(MainMiddle) == EPI.VectorTripCount &&
10418                 match(
10419                     P.getIncomingValueForBlock(EPI.MainLoopIterationCountCheck),
10420                     m_SpecificInt(0)))
10421               return &P;
10422             return nullptr;
10423           });
10424       assert(EPResumeVal && "must have a resume value for the canonical IV");
10425       VPValue *VPV = Plan.getOrAddLiveIn(EPResumeVal);
10426       assert(all_of(IV->users(),
10427                     [](const VPUser *U) {
10428                       return isa<VPScalarIVStepsRecipe>(U) ||
10429                              isa<VPScalarCastRecipe>(U) ||
10430                              isa<VPDerivedIVRecipe>(U) ||
10431                              cast<VPInstruction>(U)->getOpcode() ==
10432                                  Instruction::Add;
10433                     }) &&
10434              "the canonical IV should only be used by its increment or "
10435              "ScalarIVSteps when resetting the start value");
10436       IV->setOperand(0, VPV);
10437       continue;
10438     }
10439 
10440     Value *ResumeV = nullptr;
10441     // TODO: Move setting of resume values to prepareToExecute.
10442     if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
10443       ResumeV = cast<PHINode>(ReductionPhi->getUnderlyingInstr())
10444                     ->getIncomingValueForBlock(L->getLoopPreheader());
10445       const RecurrenceDescriptor &RdxDesc =
10446           ReductionPhi->getRecurrenceDescriptor();
10447       RecurKind RK = RdxDesc.getRecurrenceKind();
10448       if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
10449         // VPReductionPHIRecipes for AnyOf reductions expect a boolean as
10450         // start value; compare the final value from the main vector loop
10451         // to the start value.
10452         IRBuilder<> Builder(
10453             cast<Instruction>(ResumeV)->getParent()->getFirstNonPHI());
10454         ResumeV =
10455             Builder.CreateICmpNE(ResumeV, RdxDesc.getRecurrenceStartValue());
10456       } else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) {
10457         // VPReductionPHIRecipe for FindLastIV reductions requires an adjustment
10458         // to the resume value. The resume value is adjusted to the sentinel
10459         // value when the final value from the main vector loop equals the start
10460         // value. This ensures correctness when the start value might not be
10461         // less than the minimum value of a monotonically increasing induction
10462         // variable.
10463         IRBuilder<> Builder(
10464             cast<Instruction>(ResumeV)->getParent()->getFirstNonPHI());
10465         Value *Cmp =
10466             Builder.CreateICmpEQ(ResumeV, RdxDesc.getRecurrenceStartValue());
10467         ResumeV =
10468             Builder.CreateSelect(Cmp, RdxDesc.getSentinelValue(), ResumeV);
10469       }
10470     } else {
10471       // Retrieve the induction resume values for wide inductions from
10472       // their original phi nodes in the scalar loop.
10473       PHINode *IndPhi = cast<VPWidenInductionRecipe>(&R)->getPHINode();
10474       // Hook up to the PHINode generated by a ResumePhi recipe of main
10475       // loop VPlan, which feeds the scalar loop.
10476       ResumeV = IndPhi->getIncomingValueForBlock(L->getLoopPreheader());
10477     }
10478     assert(ResumeV && "Must have a resume value");
10479     VPValue *StartVal = Plan.getOrAddLiveIn(ResumeV);
10480     cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
10481   }
10482 }
10483 
10484 bool LoopVectorizePass::processLoop(Loop *L) {
10485   assert((EnableVPlanNativePath || L->isInnermost()) &&
10486          "VPlan-native path is not enabled. Only process inner loops.");
10487 
10488   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
10489                     << L->getHeader()->getParent()->getName() << "' from "
10490                     << L->getLocStr() << "\n");
10491 
10492   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
10493 
10494   LLVM_DEBUG(
10495       dbgs() << "LV: Loop hints:"
10496              << " force="
10497              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
10498                      ? "disabled"
10499                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
10500                             ? "enabled"
10501                             : "?"))
10502              << " width=" << Hints.getWidth()
10503              << " interleave=" << Hints.getInterleave() << "\n");
10504 
10505   // Function containing loop
10506   Function *F = L->getHeader()->getParent();
10507 
10508   // Looking at the diagnostic output is the only way to determine if a loop
10509   // was vectorized (other than looking at the IR or machine code), so it
10510   // is important to generate an optimization remark for each loop. Most of
10511   // these messages are generated as OptimizationRemarkAnalysis. Remarks
10512   // generated as OptimizationRemark and OptimizationRemarkMissed are
10513   // less verbose, reporting vectorized loops and unvectorized loops that may
10514   // benefit from vectorization, respectively.
10515 
10516   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
10517     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
10518     return false;
10519   }
10520 
10521   PredicatedScalarEvolution PSE(*SE, *L);
10522 
10523   // Check if it is legal to vectorize the loop.
10524   LoopVectorizationRequirements Requirements;
10525   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE,
10526                                 &Requirements, &Hints, DB, AC, BFI, PSI);
10527   if (!LVL.canVectorize(EnableVPlanNativePath)) {
10528     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
10529     Hints.emitRemarkWithHints();
10530     return false;
10531   }
10532 
10533   if (LVL.hasUncountableEarlyExit() && !EnableEarlyExitVectorization) {
10534     reportVectorizationFailure("Auto-vectorization of loops with uncountable "
10535                                "early exit is not enabled",
10536                                "UncountableEarlyExitLoopsDisabled", ORE, L);
10537     return false;
10538   }
10539 
10540   if (LVL.hasStructVectorCall()) {
10541     reportVectorizationFailure("Auto-vectorization of calls that return struct "
10542                                "types is not yet supported",
10543                                "StructCallVectorizationUnsupported", ORE, L);
10544     return false;
10545   }
10546 
10547   // Entrance to the VPlan-native vectorization path. Outer loops are processed
10548   // here. They may require CFG and instruction level transformations before
10549   // even evaluating whether vectorization is profitable. Since we cannot modify
10550   // the incoming IR, we need to build VPlan upfront in the vectorization
10551   // pipeline.
10552   if (!L->isInnermost())
10553     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
10554                                         ORE, BFI, PSI, Hints, Requirements);
10555 
10556   assert(L->isInnermost() && "Inner loop expected.");
10557 
10558   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
10559   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
10560 
10561   // If an override option has been passed in for interleaved accesses, use it.
10562   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
10563     UseInterleaved = EnableInterleavedMemAccesses;
10564 
10565   // Analyze interleaved memory accesses.
10566   if (UseInterleaved)
10567     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
10568 
10569   if (LVL.hasUncountableEarlyExit()) {
10570     BasicBlock *LoopLatch = L->getLoopLatch();
10571     if (IAI.requiresScalarEpilogue() ||
10572         any_of(LVL.getCountableExitingBlocks(),
10573                [LoopLatch](BasicBlock *BB) { return BB != LoopLatch; })) {
10574       reportVectorizationFailure("Auto-vectorization of early exit loops "
10575                                  "requiring a scalar epilogue is unsupported",
10576                                  "UncountableEarlyExitUnsupported", ORE, L);
10577       return false;
10578     }
10579   }
10580 
10581   // Check the function attributes and profiles to find out if this function
10582   // should be optimized for size.
10583   ScalarEpilogueLowering SEL =
10584       getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL, &IAI);
10585 
10586   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
10587   // count by optimizing for size, to minimize overheads.
10588   auto ExpectedTC = getSmallBestKnownTC(PSE, L);
10589   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
10590     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
10591                       << "This loop is worth vectorizing only if no scalar "
10592                       << "iteration overheads are incurred.");
10593     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
10594       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
10595     else {
10596       if (*ExpectedTC > TTI->getMinTripCountTailFoldingThreshold()) {
10597         LLVM_DEBUG(dbgs() << "\n");
10598         // Predicate tail-folded loops are efficient even when the loop
10599         // iteration count is low. However, setting the epilogue policy to
10600         // `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops
10601         // with runtime checks. It's more effective to let
10602         // `areRuntimeChecksProfitable` determine if vectorization is beneficial
10603         // for the loop.
10604         if (SEL != CM_ScalarEpilogueNotNeededUsePredicate)
10605           SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
10606       } else {
10607         LLVM_DEBUG(dbgs() << " But the target considers the trip count too "
10608                              "small to consider vectorizing.\n");
10609         reportVectorizationFailure(
10610             "The trip count is below the minimal threshold value.",
10611             "loop trip count is too low, avoiding vectorization",
10612             "LowTripCount", ORE, L);
10613         Hints.emitRemarkWithHints();
10614         return false;
10615       }
10616     }
10617   }
10618 
10619   // Check the function attributes to see if implicit floats or vectors are
10620   // allowed.
10621   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
10622     reportVectorizationFailure(
10623         "Can't vectorize when the NoImplicitFloat attribute is used",
10624         "loop not vectorized due to NoImplicitFloat attribute",
10625         "NoImplicitFloat", ORE, L);
10626     Hints.emitRemarkWithHints();
10627     return false;
10628   }
10629 
10630   // Check if the target supports potentially unsafe FP vectorization.
10631   // FIXME: Add a check for the type of safety issue (denormal, signaling)
10632   // for the target we're vectorizing for, to make sure none of the
10633   // additional fp-math flags can help.
10634   if (Hints.isPotentiallyUnsafe() &&
10635       TTI->isFPVectorizationPotentiallyUnsafe()) {
10636     reportVectorizationFailure(
10637         "Potentially unsafe FP op prevents vectorization",
10638         "loop not vectorized due to unsafe FP support.",
10639         "UnsafeFP", ORE, L);
10640     Hints.emitRemarkWithHints();
10641     return false;
10642   }
10643 
10644   bool AllowOrderedReductions;
10645   // If the flag is set, use that instead and override the TTI behaviour.
10646   if (ForceOrderedReductions.getNumOccurrences() > 0)
10647     AllowOrderedReductions = ForceOrderedReductions;
10648   else
10649     AllowOrderedReductions = TTI->enableOrderedReductions();
10650   if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) {
10651     ORE->emit([&]() {
10652       auto *ExactFPMathInst = Requirements.getExactFPInst();
10653       return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
10654                                                  ExactFPMathInst->getDebugLoc(),
10655                                                  ExactFPMathInst->getParent())
10656              << "loop not vectorized: cannot prove it is safe to reorder "
10657                 "floating-point operations";
10658     });
10659     LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
10660                          "reorder floating-point operations\n");
10661     Hints.emitRemarkWithHints();
10662     return false;
10663   }
10664 
10665   // Use the cost model.
10666   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
10667                                 F, &Hints, IAI);
10668   // Use the planner for vectorization.
10669   LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
10670                                ORE);
10671 
10672   // Get user vectorization factor and interleave count.
10673   ElementCount UserVF = Hints.getWidth();
10674   unsigned UserIC = Hints.getInterleave();
10675 
10676   // Plan how to best vectorize.
10677   LVP.plan(UserVF, UserIC);
10678   VectorizationFactor VF = LVP.computeBestVF();
10679   unsigned IC = 1;
10680 
10681   if (ORE->allowExtraAnalysis(LV_NAME))
10682     LVP.emitInvalidCostRemarks(ORE);
10683 
10684   bool AddBranchWeights =
10685       hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
10686   GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(),
10687                            AddBranchWeights);
10688   if (LVP.hasPlanWithVF(VF.Width)) {
10689     // Select the interleave count.
10690     IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
10691 
10692     unsigned SelectedIC = std::max(IC, UserIC);
10693     //  Optimistically generate runtime checks if they are needed. Drop them if
10694     //  they turn out to not be profitable.
10695     if (VF.Width.isVector() || SelectedIC > 1)
10696       Checks.create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
10697 
10698     // Check if it is profitable to vectorize with runtime checks.
10699     bool ForceVectorization =
10700         Hints.getForce() == LoopVectorizeHints::FK_Enabled;
10701     if (!ForceVectorization &&
10702         !areRuntimeChecksProfitable(Checks, VF, L, *TTI, PSE, SEL)) {
10703       ORE->emit([&]() {
10704         return OptimizationRemarkAnalysisAliasing(
10705                    DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
10706                    L->getHeader())
10707                << "loop not vectorized: cannot prove it is safe to reorder "
10708                   "memory operations";
10709       });
10710       LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
10711       Hints.emitRemarkWithHints();
10712       return false;
10713     }
10714   }
10715 
10716   // Identify the diagnostic messages that should be produced.
10717   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
10718   bool VectorizeLoop = true, InterleaveLoop = true;
10719   if (VF.Width.isScalar()) {
10720     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
10721     VecDiagMsg = std::make_pair(
10722         "VectorizationNotBeneficial",
10723         "the cost-model indicates that vectorization is not beneficial");
10724     VectorizeLoop = false;
10725   }
10726 
10727   if (!LVP.hasPlanWithVF(VF.Width) && UserIC > 1) {
10728     // Tell the user interleaving was avoided up-front, despite being explicitly
10729     // requested.
10730     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
10731                          "interleaving should be avoided up front\n");
10732     IntDiagMsg = std::make_pair(
10733         "InterleavingAvoided",
10734         "Ignoring UserIC, because interleaving was avoided up front");
10735     InterleaveLoop = false;
10736   } else if (IC == 1 && UserIC <= 1) {
10737     // Tell the user interleaving is not beneficial.
10738     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
10739     IntDiagMsg = std::make_pair(
10740         "InterleavingNotBeneficial",
10741         "the cost-model indicates that interleaving is not beneficial");
10742     InterleaveLoop = false;
10743     if (UserIC == 1) {
10744       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
10745       IntDiagMsg.second +=
10746           " and is explicitly disabled or interleave count is set to 1";
10747     }
10748   } else if (IC > 1 && UserIC == 1) {
10749     // Tell the user interleaving is beneficial but is explicitly disabled.
10750     LLVM_DEBUG(dbgs() << "LV: Interleaving is beneficial but is explicitly "
10751                          "disabled.\n");
10752     IntDiagMsg = std::make_pair(
10753         "InterleavingBeneficialButDisabled",
10754         "the cost-model indicates that interleaving is beneficial "
10755         "but is explicitly disabled or interleave count is set to 1");
10756     InterleaveLoop = false;
10757   }
10758 
10759   // If there is a histogram in the loop, do not just interleave without
10760   // vectorizing. The order of operations will be incorrect without the
10761   // histogram intrinsics, which are only used for recipes with VF > 1.
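        // ("Histogram" means an update such as buckets[idx[i]] += 1, where two
        // iterations may increment the same bucket.)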
10762   if (!VectorizeLoop && InterleaveLoop && LVL.hasHistograms()) {
10763     LLVM_DEBUG(dbgs() << "LV: Not interleaving without vectorization due "
10764                       << "to histogram operations.\n");
10765     IntDiagMsg = std::make_pair(
10766         "HistogramPreventsScalarInterleaving",
10767         "Unable to interleave without vectorization due to constraints on "
10768         "the order of histogram operations");
10769     InterleaveLoop = false;
10770   }
10771 
10772   // Override IC if user provided an interleave count.
10773   IC = UserIC > 0 ? UserIC : IC;
10774 
10775   // Emit diagnostic messages, if any.
10776   const char *VAPassName = Hints.vectorizeAnalysisPassName();
10777   if (!VectorizeLoop && !InterleaveLoop) {
10778     // Do not vectorize or interleave the loop.
10779     ORE->emit([&]() {
10780       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
10781                                       L->getStartLoc(), L->getHeader())
10782              << VecDiagMsg.second;
10783     });
10784     ORE->emit([&]() {
10785       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
10786                                       L->getStartLoc(), L->getHeader())
10787              << IntDiagMsg.second;
10788     });
10789     return false;
10790   }
10791 
10792   if (!VectorizeLoop && InterleaveLoop) {
10793     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10794     ORE->emit([&]() {
10795       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
10796                                         L->getStartLoc(), L->getHeader())
10797              << VecDiagMsg.second;
10798     });
10799   } else if (VectorizeLoop && !InterleaveLoop) {
10800     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10801                       << ") in " << L->getLocStr() << '\n');
10802     ORE->emit([&]() {
10803       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
10804                                         L->getStartLoc(), L->getHeader())
10805              << IntDiagMsg.second;
10806     });
10807   } else if (VectorizeLoop && InterleaveLoop) {
10808     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10809                       << ") in " << L->getLocStr() << '\n');
10810     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10811   }
10812 
10813   bool DisableRuntimeUnroll = false;
10814   MDNode *OrigLoopID = L->getLoopID();
10815   {
10816     using namespace ore;
10817     if (!VectorizeLoop) {
10818       assert(IC > 1 && "interleave count should not be 1 or 0");
10819       // If we decided that it is not profitable to vectorize the loop,
10820       // then interleave it instead.
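            // With VF fixed to 1 and IC > 1 this effectively unrolls the scalar
            // loop: for IC == 2, each iteration of the new loop executes the
            // bodies for i and i + 1, exposing more ILP without vector types.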
10821       VPlan &BestPlan = LVP.getPlanFor(VF.Width);
10822       InnerLoopVectorizer Unroller(
10823           L, PSE, LI, DT, TLI, TTI, AC, ORE, ElementCount::getFixed(1),
10824           ElementCount::getFixed(1), IC, &LVL, &CM, BFI, PSI, Checks, BestPlan);
10825 
10826       LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);
10827 
10828       ORE->emit([&]() {
10829         return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
10830                                   L->getHeader())
10831                << "interleaved loop (interleaved count: "
10832                << NV("InterleaveCount", IC) << ")";
10833       });
10834     } else {
10835       // If it is both legal and profitable to vectorize the loop, then do it.
10836 
10837       VPlan &BestPlan = LVP.getPlanFor(VF.Width);
10838       // Consider vectorizing the epilogue too if it's profitable.
10839       VectorizationFactor EpilogueVF =
10840           LVP.selectEpilogueVectorizationFactor(VF.Width, IC);
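            // With epilogue vectorization the main loop runs at the wide factor
            // and the leftover iterations are handled by a second, narrower
            // vector loop (for example VF 8 for the main loop and VF 4 for the
            // epilogue), leaving only a short scalar remainder.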
10841       if (EpilogueVF.Width.isVector()) {
10842         std::unique_ptr<VPlan> BestMainPlan(BestPlan.duplicate());
10843 
10844         // The first pass vectorizes the main loop and creates a scalar epilogue
10845         // to be vectorized by executing the plan (potentially with a different
10846         // factor) again shortly afterwards.
10847         VPlan &BestEpiPlan = LVP.getPlanFor(EpilogueVF.Width);
10848         preparePlanForMainVectorLoop(*BestMainPlan, BestEpiPlan);
10849         EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1,
10850                                           BestEpiPlan);
10851         EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
10852                                            EPI, &LVL, &CM, BFI, PSI, Checks,
10853                                            *BestMainPlan);
10854         auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF,
10855                                              *BestMainPlan, MainILV, DT, false);
10856         ++LoopsVectorized;
10857 
10858         // The second pass vectorizes the epilogue and adjusts the
10859         // control-flow edges created by the first pass.
10860         EPI.MainLoopVF = EPI.EpilogueVF;
10861         EPI.MainLoopUF = EPI.EpilogueUF;
10862         EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
10863                                                  ORE, EPI, &LVL, &CM, BFI, PSI,
10864                                                  Checks, BestEpiPlan);
10865         EpilogILV.setTripCount(MainILV.getTripCount());
10866         preparePlanForEpilogueVectorLoop(BestEpiPlan, L, ExpandedSCEVs, EPI);
10867 
10868         LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
10869                         DT, true, &ExpandedSCEVs);
10870         ++LoopsEpilogueVectorized;
10871 
10872         if (!MainILV.areSafetyChecksAdded())
10873           DisableRuntimeUnroll = true;
10874       } else {
10875         InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
10876                                VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
10877                                PSI, Checks, BestPlan);
10878         LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
10879         ++LoopsVectorized;
10880 
10881         // Add metadata to disable runtime unrolling of the scalar loop when
10882         // there are no runtime checks on strides and memory. A scalar loop
10883         // that is rarely executed is not worth unrolling.
10884         if (!LB.areSafetyChecksAdded())
10885           DisableRuntimeUnroll = true;
10886       }
10887       // Report the vectorization decision.
10888       reportVectorization(ORE, L, VF, IC);
10889     }
10890 
10891     if (ORE->allowExtraAnalysis(LV_NAME))
10892       checkMixedPrecision(L, ORE);
10893   }
10894 
10895   assert(DT->verify(DominatorTree::VerificationLevel::Fast) &&
10896          "DT not preserved correctly");
10897 
10898   std::optional<MDNode *> RemainderLoopID =
10899       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
10900                                       LLVMLoopVectorizeFollowupEpilogue});
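        // The followup metadata (llvm.loop.vectorize.followup_all and
        // llvm.loop.vectorize.followup_epilogue) lets users attach attributes
        // to the remainder loop explicitly; otherwise the loop is marked as
        // already vectorized (llvm.loop.isvectorized) and, when no runtime
        // checks guard it, runtime unrolling is disabled as well.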
10901   if (RemainderLoopID) {
10902     L->setLoopID(*RemainderLoopID);
10903   } else {
10904     if (DisableRuntimeUnroll)
10905       addRuntimeUnrollDisableMetaData(L);
10906 
10907     // Mark the loop as already vectorized to avoid vectorizing again.
10908     Hints.setAlreadyVectorized();
10909   }
10910 
10911   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10912   return true;
10913 }
10914 
10915 LoopVectorizeResult LoopVectorizePass::runImpl(Function &F) {
10916 
10917   // Don't attempt if
10918   // 1. the target claims to have no vector registers, and
10919   // 2. interleaving won't help ILP.
10920   //
10921   // The second condition is necessary because, even if the target has no
10922   // vector registers, loop vectorization may still enable scalar
10923   // interleaving.
10924   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
10925       TTI->getMaxInterleaveFactor(ElementCount::getFixed(1)) < 2)
10926     return LoopVectorizeResult(false, false);
10927 
10928   bool Changed = false, CFGChanged = false;
10929 
10930   // The vectorizer requires loops to be in simplified form.
10931   // Since simplification may add new inner loops, it has to run before the
10932   // legality and profitability checks. This means running the loop vectorizer
10933   // will simplify all loops, regardless of whether anything ends up being
10934   // vectorized.
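        // Simplified form roughly means every loop has a preheader, a single
        // backedge, and dedicated exit blocks, which later stages rely on.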
10935   for (const auto &L : *LI)
10936     Changed |= CFGChanged |=
10937         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10938 
10939   // Build up a worklist of inner-loops to vectorize. This is necessary as
10940   // the act of vectorizing or partially unrolling a loop creates new loops
10941   // and can invalidate iterators across the loops.
10942   SmallVector<Loop *, 8> Worklist;
10943 
10944   for (Loop *L : *LI)
10945     collectSupportedLoops(*L, LI, ORE, Worklist);
10946 
10947   LoopsAnalyzed += Worklist.size();
10948 
10949   // Now walk the identified inner loops.
10950   while (!Worklist.empty()) {
10951     Loop *L = Worklist.pop_back_val();
10952 
10953     // For the inner loops we actually process, form LCSSA to simplify the
10954     // transform.
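          // LCSSA inserts PHI nodes in the exit blocks for values defined in
          // the loop and used outside it, so rewrites of loop values only have
          // to update those PHIs.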
10955     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
10956 
10957     Changed |= CFGChanged |= processLoop(L);
10958 
10959     if (Changed) {
10960       LAIs->clear();
10961 
10962 #ifndef NDEBUG
10963       if (VerifySCEV)
10964         SE->verify();
10965 #endif
10966     }
10967   }
10968 
10969   // Report whether anything changed and whether the CFG was modified.
10970   return LoopVectorizeResult(Changed, CFGChanged);
10971 }
10972 
10973 PreservedAnalyses LoopVectorizePass::run(Function &F,
10974                                          FunctionAnalysisManager &AM) {
10975   LI = &AM.getResult<LoopAnalysis>(F);
10976   // There are no loops in the function. Return before computing other
10977   // expensive analyses.
10978   if (LI->empty())
10979     return PreservedAnalyses::all();
10980   SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
10981   TTI = &AM.getResult<TargetIRAnalysis>(F);
10982   DT = &AM.getResult<DominatorTreeAnalysis>(F);
10983   TLI = &AM.getResult<TargetLibraryAnalysis>(F);
10984   AC = &AM.getResult<AssumptionAnalysis>(F);
10985   DB = &AM.getResult<DemandedBitsAnalysis>(F);
10986   ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
10987   LAIs = &AM.getResult<LoopAccessAnalysis>(F);
10988 
10989   auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
10990   PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
10991   BFI = nullptr;
10992   if (PSI && PSI->hasProfileSummary())
10993     BFI = &AM.getResult<BlockFrequencyAnalysis>(F);
10994   LoopVectorizeResult Result = runImpl(F);
10995   if (!Result.MadeAnyChange)
10996     return PreservedAnalyses::all();
10997   PreservedAnalyses PA;
10998 
10999   if (isAssignmentTrackingEnabled(*F.getParent())) {
11000     for (auto &BB : F)
11001       RemoveRedundantDbgInstrs(&BB);
11002   }
11003 
11004   PA.preserve<LoopAnalysis>();
11005   PA.preserve<DominatorTreeAnalysis>();
11006   PA.preserve<ScalarEvolutionAnalysis>();
11007   PA.preserve<LoopAccessAnalysis>();
11008 
11009   if (Result.MadeCFGChange) {
11010     // Making CFG changes likely means a loop got vectorized. Indicate that
11011     // extra simplification passes should be run.
11012     // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
11013     // be run if runtime checks have been added.
11014     AM.getResult<ShouldRunExtraVectorPasses>(F);
11015     PA.preserve<ShouldRunExtraVectorPasses>();
11016   } else {
11017     PA.preserveSet<CFGAnalyses>();
11018   }
11019   return PA;
11020 }
11021 
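      // With the default options this prints something like
      //   loop-vectorize<no-interleave-forced-only;no-vectorize-forced-only>
      // i.e. each boolean option is rendered, possibly "no-"-prefixed, between
      // the angle brackets after the pass name.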
11022 void LoopVectorizePass::printPipeline(
11023     raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
11024   static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
11025       OS, MapClassName2PassName);
11026 
11027   OS << '<';
11028   OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
11029   OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
11030   OS << '>';
11031 }
11032