1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
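//
// For illustration only (a conceptual sketch, not the exact code this pass
// emits), a scalar loop such as
//
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + 1;
//
// is executed, for a vector width of 4, as
//
//   for (int i = 0; i + 3 < n; i += 4)
//     a[i:i+3] = b[i:i+3] + 1;   // one 'wide' iteration
//
// with any remaining iterations handled by a scalar epilogue loop or by
// predicating (tail-folding) the vector body.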
17 //
18 // This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
28 // An ongoing development effort aims to migrate the loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanAnalysis.h"
61 #include "VPlanHCFGBuilder.h"
62 #include "VPlanPatternMatch.h"
63 #include "VPlanTransforms.h"
64 #include "VPlanUtils.h"
65 #include "VPlanVerifier.h"
66 #include "llvm/ADT/APInt.h"
67 #include "llvm/ADT/ArrayRef.h"
68 #include "llvm/ADT/DenseMap.h"
69 #include "llvm/ADT/DenseMapInfo.h"
70 #include "llvm/ADT/Hashing.h"
71 #include "llvm/ADT/MapVector.h"
72 #include "llvm/ADT/STLExtras.h"
73 #include "llvm/ADT/SmallPtrSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/TypeSwitch.h"
79 #include "llvm/ADT/iterator_range.h"
80 #include "llvm/Analysis/AssumptionCache.h"
81 #include "llvm/Analysis/BasicAliasAnalysis.h"
82 #include "llvm/Analysis/BlockFrequencyInfo.h"
83 #include "llvm/Analysis/CFG.h"
84 #include "llvm/Analysis/CodeMetrics.h"
85 #include "llvm/Analysis/DemandedBits.h"
86 #include "llvm/Analysis/GlobalsModRef.h"
87 #include "llvm/Analysis/LoopAccessAnalysis.h"
88 #include "llvm/Analysis/LoopAnalysisManager.h"
89 #include "llvm/Analysis/LoopInfo.h"
90 #include "llvm/Analysis/LoopIterator.h"
91 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
92 #include "llvm/Analysis/ProfileSummaryInfo.h"
93 #include "llvm/Analysis/ScalarEvolution.h"
94 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
95 #include "llvm/Analysis/TargetLibraryInfo.h"
96 #include "llvm/Analysis/TargetTransformInfo.h"
97 #include "llvm/Analysis/ValueTracking.h"
98 #include "llvm/Analysis/VectorUtils.h"
99 #include "llvm/IR/Attributes.h"
100 #include "llvm/IR/BasicBlock.h"
101 #include "llvm/IR/CFG.h"
102 #include "llvm/IR/Constant.h"
103 #include "llvm/IR/Constants.h"
104 #include "llvm/IR/DataLayout.h"
105 #include "llvm/IR/DebugInfo.h"
106 #include "llvm/IR/DebugLoc.h"
107 #include "llvm/IR/DerivedTypes.h"
108 #include "llvm/IR/DiagnosticInfo.h"
109 #include "llvm/IR/Dominators.h"
110 #include "llvm/IR/Function.h"
111 #include "llvm/IR/IRBuilder.h"
112 #include "llvm/IR/InstrTypes.h"
113 #include "llvm/IR/Instruction.h"
114 #include "llvm/IR/Instructions.h"
115 #include "llvm/IR/IntrinsicInst.h"
116 #include "llvm/IR/Intrinsics.h"
117 #include "llvm/IR/MDBuilder.h"
118 #include "llvm/IR/Metadata.h"
119 #include "llvm/IR/Module.h"
120 #include "llvm/IR/Operator.h"
121 #include "llvm/IR/PatternMatch.h"
122 #include "llvm/IR/ProfDataUtils.h"
123 #include "llvm/IR/Type.h"
124 #include "llvm/IR/Use.h"
125 #include "llvm/IR/User.h"
126 #include "llvm/IR/Value.h"
127 #include "llvm/IR/Verifier.h"
128 #include "llvm/Support/Casting.h"
129 #include "llvm/Support/CommandLine.h"
130 #include "llvm/Support/Debug.h"
131 #include "llvm/Support/ErrorHandling.h"
132 #include "llvm/Support/InstructionCost.h"
133 #include "llvm/Support/MathExtras.h"
134 #include "llvm/Support/NativeFormatting.h"
135 #include "llvm/Support/raw_ostream.h"
136 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
137 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
138 #include "llvm/Transforms/Utils/Local.h"
139 #include "llvm/Transforms/Utils/LoopSimplify.h"
140 #include "llvm/Transforms/Utils/LoopUtils.h"
141 #include "llvm/Transforms/Utils/LoopVersioning.h"
142 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
143 #include "llvm/Transforms/Utils/SizeOpts.h"
144 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
145 #include <algorithm>
146 #include <cassert>
147 #include <cstdint>
148 #include <functional>
149 #include <iterator>
150 #include <limits>
151 #include <memory>
152 #include <string>
153 #include <tuple>
154 #include <utility>
155 
156 using namespace llvm;
157 
158 #define LV_NAME "loop-vectorize"
159 #define DEBUG_TYPE LV_NAME
160 
161 #ifndef NDEBUG
162 const char VerboseDebug[] = DEBUG_TYPE "-verbose";
163 #endif
164 
165 /// @{
166 /// Metadata attribute names
167 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
168 const char LLVMLoopVectorizeFollowupVectorized[] =
169     "llvm.loop.vectorize.followup_vectorized";
170 const char LLVMLoopVectorizeFollowupEpilogue[] =
171     "llvm.loop.vectorize.followup_epilogue";
172 /// @}
173 
174 STATISTIC(LoopsVectorized, "Number of loops vectorized");
175 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
176 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
177 
178 static cl::opt<bool> EnableEpilogueVectorization(
179     "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
180     cl::desc("Enable vectorization of epilogue loops."));
181 
182 static cl::opt<unsigned> EpilogueVectorizationForceVF(
183     "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
184     cl::desc("When epilogue vectorization is enabled, and a value greater than "
185              "1 is specified, forces the given VF for all applicable epilogue "
186              "loops."));
187 
188 static cl::opt<unsigned> EpilogueVectorizationMinVF(
189     "epilogue-vectorization-minimum-VF", cl::Hidden,
190     cl::desc("Only loops with vectorization factor equal to or larger than "
191              "the specified value are considered for epilogue vectorization."));
192 
193 /// Loops with a known constant trip count below this number are vectorized only
194 /// if no scalar iteration overheads are incurred.
195 static cl::opt<unsigned> TinyTripCountVectorThreshold(
196     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
197     cl::desc("Loops with a constant trip count that is smaller than this "
198              "value are vectorized only if no scalar iteration overheads "
199              "are incurred."));
200 
201 static cl::opt<unsigned> VectorizeMemoryCheckThreshold(
202     "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
203     cl::desc("The maximum allowed number of runtime memory checks"));
204 
205 // The option prefer-predicate-over-epilogue indicates that an epilogue is
206 // undesired and predication is preferred; the values below list the choices.
207 // I.e., the vectorizer will try to fold the tail loop (epilogue) into the
208 // vector body and predicate the instructions accordingly. If tail-folding
209 // fails, there are different fallback strategies depending on these values:
210 namespace PreferPredicateTy {
211   enum Option {
212     ScalarEpilogue = 0,
213     PredicateElseScalarEpilogue,
214     PredicateOrDontVectorize
215   };
216 } // namespace PreferPredicateTy
217 
218 static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
219     "prefer-predicate-over-epilogue",
220     cl::init(PreferPredicateTy::ScalarEpilogue),
221     cl::Hidden,
222     cl::desc("Tail-folding and predication preferences over creating a scalar "
223              "epilogue loop."),
224     cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
225                          "scalar-epilogue",
226                          "Don't tail-predicate loops, create scalar epilogue"),
227               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
228                          "predicate-else-scalar-epilogue",
229                          "prefer tail-folding, create scalar epilogue if tail "
230                          "folding fails."),
231               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
232                          "predicate-dont-vectorize",
233                          "prefers tail-folding, don't attempt vectorization if "
234                          "tail-folding fails.")));
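// For example, a hypothetical invocation that asks for tail-folding with a
// scalar-epilogue fallback (using the value defined above) might look like:
//
//   opt -passes=loop-vectorize \
//       -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue ...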
235 
236 static cl::opt<TailFoldingStyle> ForceTailFoldingStyle(
237     "force-tail-folding-style", cl::desc("Force the tail folding style"),
238     cl::init(TailFoldingStyle::None),
239     cl::values(
240         clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"),
241         clEnumValN(
242             TailFoldingStyle::Data, "data",
243             "Create lane mask for data only, using active.lane.mask intrinsic"),
244         clEnumValN(TailFoldingStyle::DataWithoutLaneMask,
245                    "data-without-lane-mask",
246                    "Create lane mask with compare/stepvector"),
247         clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control",
248                    "Create lane mask using active.lane.mask intrinsic, and use "
249                    "it for both data and control flow"),
250         clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
251                    "data-and-control-without-rt-check",
252                    "Similar to data-and-control, but remove the runtime check"),
253         clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl",
254                    "Use predicated EVL instructions for tail folding. If EVL "
255                    "is unsupported, fallback to data-without-lane-mask.")));
256 
257 static cl::opt<bool> MaximizeBandwidth(
258     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
259     cl::desc("Maximize bandwidth when selecting vectorization factor which "
260              "will be determined by the smallest type in loop."));
261 
262 static cl::opt<bool> EnableInterleavedMemAccesses(
263     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
264     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
265 
266 /// An interleave-group may need masking if it resides in a block that needs
267 /// predication, or in order to mask away gaps.
268 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
269     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
270     cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
271 
272 static cl::opt<unsigned> ForceTargetNumScalarRegs(
273     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
274     cl::desc("A flag that overrides the target's number of scalar registers."));
275 
276 static cl::opt<unsigned> ForceTargetNumVectorRegs(
277     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
278     cl::desc("A flag that overrides the target's number of vector registers."));
279 
280 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
281     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
282     cl::desc("A flag that overrides the target's max interleave factor for "
283              "scalar loops."));
284 
285 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
286     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
287     cl::desc("A flag that overrides the target's max interleave factor for "
288              "vectorized loops."));
289 
290 cl::opt<unsigned> ForceTargetInstructionCost(
291     "force-target-instruction-cost", cl::init(0), cl::Hidden,
292     cl::desc("A flag that overrides the target's expected cost for "
293              "an instruction to a single constant value. Mostly "
294              "useful for getting consistent testing."));
295 
296 static cl::opt<bool> ForceTargetSupportsScalableVectors(
297     "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
298     cl::desc(
299         "Pretend that scalable vectors are supported, even if the target does "
300         "not support them. This flag should only be used for testing."));
301 
302 static cl::opt<unsigned> SmallLoopCost(
303     "small-loop-cost", cl::init(20), cl::Hidden,
304     cl::desc(
305         "The cost of a loop that is considered 'small' by the interleaver."));
306 
307 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
308     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
309     cl::desc("Enable the use of the block frequency analysis to access PGO "
310              "heuristics minimizing code growth in cold regions and being more "
311              "aggressive in hot regions."));
312 
313 // Runtime interleave loops for load/store throughput.
314 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
315     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
316     cl::desc(
317         "Enable runtime interleaving until load/store ports are saturated"));
318 
319 /// The number of stores in a loop that are allowed to need predication.
320 static cl::opt<unsigned> NumberOfStoresToPredicate(
321     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
322     cl::desc("Max number of stores to be predicated behind an if."));
323 
324 static cl::opt<bool> EnableIndVarRegisterHeur(
325     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
326     cl::desc("Count the induction variable only once when interleaving"));
327 
328 static cl::opt<bool> EnableCondStoresVectorization(
329     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
330     cl::desc("Enable if-predication of stores during vectorization."));
331 
332 static cl::opt<unsigned> MaxNestedScalarReductionIC(
333     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
334     cl::desc("The maximum interleave count to use when interleaving a scalar "
335              "reduction in a nested loop."));
336 
337 static cl::opt<bool>
338     PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
339                            cl::Hidden,
340                            cl::desc("Prefer in-loop vector reductions, "
341                                     "overriding the target's preference."));
342 
343 static cl::opt<bool> ForceOrderedReductions(
344     "force-ordered-reductions", cl::init(false), cl::Hidden,
345     cl::desc("Enable the vectorization of loops with in-order (strict) "
346              "FP reductions"));
347 
348 static cl::opt<bool> PreferPredicatedReductionSelect(
349     "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
350     cl::desc(
351         "Prefer predicating a reduction operation over an after loop select."));
352 
353 namespace llvm {
354 cl::opt<bool> EnableVPlanNativePath(
355     "enable-vplan-native-path", cl::Hidden,
356     cl::desc("Enable VPlan-native vectorization path with "
357              "support for outer loop vectorization."));
358 } // namespace llvm
359 
360 // This flag enables the stress testing of the VPlan H-CFG construction in the
361 // VPlan-native vectorization path. It must be used in conjunction with
362 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
363 // verification of the H-CFGs built.
364 static cl::opt<bool> VPlanBuildStressTest(
365     "vplan-build-stress-test", cl::init(false), cl::Hidden,
366     cl::desc(
367         "Build VPlan for every supported loop nest in the function and bail "
368         "out right after the build (stress test the VPlan H-CFG construction "
369         "in the VPlan-native vectorization path)."));
370 
371 cl::opt<bool> llvm::EnableLoopInterleaving(
372     "interleave-loops", cl::init(true), cl::Hidden,
373     cl::desc("Enable loop interleaving in Loop vectorization passes"));
374 cl::opt<bool> llvm::EnableLoopVectorization(
375     "vectorize-loops", cl::init(true), cl::Hidden,
376     cl::desc("Run the Loop vectorization passes"));
377 
378 static cl::opt<cl::boolOrDefault> ForceSafeDivisor(
379     "force-widen-divrem-via-safe-divisor", cl::Hidden,
380     cl::desc(
381         "Override cost based safe divisor widening for div/rem instructions"));
382 
383 static cl::opt<bool> UseWiderVFIfCallVariantsPresent(
384     "vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true),
385     cl::Hidden,
386     cl::desc("Try wider VFs if they enable the use of vector variants"));
387 
388 static cl::opt<bool> EnableEarlyExitVectorization(
389     "enable-early-exit-vectorization", cl::init(false), cl::Hidden,
390     cl::desc(
391         "Enable vectorization of early exit loops with uncountable exits."));
392 
393 // Likelihood of bypassing the vectorized loop because assumptions about SCEV
394 // variables not overflowing do not hold. See `emitSCEVChecks`.
395 static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127};
396 // Likelihood of bypassing the vectorized loop because pointers overlap. See
397 // `emitMemRuntimeChecks`.
398 static constexpr uint32_t MemCheckBypassWeights[] = {1, 127};
399 // Likelihood of bypassing the vectorized loop because there are zero trips left
400 // after prolog. See `emitIterationCountCheck`.
401 static constexpr uint32_t MinItersBypassWeights[] = {1, 127};
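// Note (interpretation of the weights above): with branch weights {1, 127} a
// bypass edge is treated as taken roughly 1 time in 128, i.e. 1 / (1 + 127),
// so the vector loop is assumed to execute in the vast majority of cases.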
402 
403 /// A helper function that returns true if the given type is irregular. The
404 /// type is irregular if its allocated size doesn't equal the store size of an
405 /// element of the corresponding vector type.
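/// For example, under a typical data layout i1 is irregular (type size 1 bit,
/// alloc size 8 bits), as is x86_fp80 (80 vs. 128 bits), while i32 is regular
/// (32 bits for both), so an array of i32 is bitcast-compatible with <N x i32>.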
406 static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
407   // Determine if an array of N elements of type Ty is "bitcast compatible"
408   // with a <N x Ty> vector.
409   // This is only true if there is no padding between the array elements.
410   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
411 }
412 
413 /// Returns "best known" trip count for the specified loop \p L as defined by
414 /// the following procedure:
415 ///   1) Returns exact trip count if it is known.
416 ///   2) Returns expected trip count according to profile data if any.
417 ///   3) Returns upper bound estimate if known, and if \p CanUseConstantMax.
418 ///   4) Returns std::nullopt if all of the above failed.
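///
/// For example, for a loop known to run exactly 100 iterations step 1 returns
/// 100; if only profile data is available, step 2 returns the trip count
/// estimated from branch weights instead.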
419 static std::optional<unsigned>
420 getSmallBestKnownTC(PredicatedScalarEvolution &PSE, Loop *L,
421                     bool CanUseConstantMax = true) {
422   // Check if exact trip count is known.
423   if (unsigned ExpectedTC = PSE.getSE()->getSmallConstantTripCount(L))
424     return ExpectedTC;
425 
426   // Check if there is an expected trip count available from profile data.
427   if (LoopVectorizeWithBlockFrequency)
428     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
429       return *EstimatedTC;
430 
431   if (!CanUseConstantMax)
432     return std::nullopt;
433 
434   // Check if upper bound estimate is known.
435   if (unsigned ExpectedTC = PSE.getSmallConstantMaxTripCount())
436     return ExpectedTC;
437 
438   return std::nullopt;
439 }
440 
441 namespace {
442 // Forward declare GeneratedRTChecks.
443 class GeneratedRTChecks;
444 
445 using SCEV2ValueTy = DenseMap<const SCEV *, Value *>;
446 } // namespace
447 
448 namespace llvm {
449 
450 AnalysisKey ShouldRunExtraVectorPasses::Key;
451 
452 /// InnerLoopVectorizer vectorizes loops which contain only one basic
453 /// block to a specified vectorization factor (VF).
454 /// This class performs the widening of scalars into vectors, or multiple
455 /// scalars. This class also implements the following features:
456 /// * It inserts an epilogue loop for handling loops that don't have iteration
457 ///   counts that are known to be a multiple of the vectorization factor.
458 /// * It handles the code generation for reduction variables.
459 /// * Scalarization (implementation using scalars) of un-vectorizable
460 ///   instructions.
461 /// InnerLoopVectorizer does not perform any vectorization-legality
462 /// checks, and relies on the caller to check for the different legality
463 /// aspects. The InnerLoopVectorizer relies on the
464 /// LoopVectorizationLegality class to provide information about the induction
465 /// and reduction variables that were found for a given vectorization factor.
466 class InnerLoopVectorizer {
467 public:
468   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
469                       LoopInfo *LI, DominatorTree *DT,
470                       const TargetLibraryInfo *TLI,
471                       const TargetTransformInfo *TTI, AssumptionCache *AC,
472                       OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
473                       ElementCount MinProfitableTripCount,
474                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
475                       LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
476                       ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks,
477                       VPlan &Plan)
478       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
479         AC(AC), ORE(ORE), VF(VecWidth),
480         MinProfitableTripCount(MinProfitableTripCount), UF(UnrollFactor),
481         Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
482         PSI(PSI), RTChecks(RTChecks), Plan(Plan),
483         VectorPHVPB(Plan.getEntry()->getSingleSuccessor()) {
484     // Query this against the original loop and save it here because the profile
485     // of the original loop header may change as the transformation happens.
486     OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
487         OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
488   }
489 
490   virtual ~InnerLoopVectorizer() = default;
491 
492   /// Create a new empty loop that will contain vectorized instructions later
493   /// on, while the old loop will be used as the scalar remainder. Control flow
494   /// is generated around the vectorized (and scalar epilogue) loops consisting
495   /// of various checks and bypasses. Return the pre-header block of the new
496 /// loop. In the case of epilogue vectorization, this function is overridden to
497   /// handle the more complex control flow around the loops. \p ExpandedSCEVs is
498   /// used to look up SCEV expansions for expressions needed during skeleton
499   /// creation.
500   virtual BasicBlock *
501   createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs);
502 
503   /// Fix the vectorized code, taking care of header phi's, and more.
504   void fixVectorizedLoop(VPTransformState &State);
505 
506   // Return true if any runtime check is added.
507   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
508 
509   /// A helper function to scalarize a single Instruction in the innermost loop.
510   /// Generates a sequence of scalar instances for each lane between \p MinLane
511   /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
512   /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p
513   /// Instr's operands.
514   void scalarizeInstruction(const Instruction *Instr,
515                             VPReplicateRecipe *RepRecipe, const VPLane &Lane,
516                             VPTransformState &State);
517 
518   /// Fix the non-induction PHIs in \p Plan.
519   void fixNonInductionPHIs(VPTransformState &State);
520 
521   /// Returns the original loop trip count.
522   Value *getTripCount() const { return TripCount; }
523 
524   /// Used to set the trip count after ILV's construction and after the
525   /// preheader block has been executed. Note that this always holds the trip
526   /// count of the original loop for both main loop and epilogue vectorization.
527   void setTripCount(Value *TC) { TripCount = TC; }
528 
529   /// Retrieve the additional bypass value associated with an original
530   /// induction header phi.
531   Value *getInductionAdditionalBypassValue(PHINode *OrigPhi) const {
532     return Induction2AdditionalBypassValue.at(OrigPhi);
533   }
534 
535   /// Return the additional bypass block which targets the scalar loop by
536   /// skipping the epilogue loop after completing the main loop.
537   BasicBlock *getAdditionalBypassBlock() const {
538     assert(AdditionalBypassBlock &&
539            "Trying to access AdditionalBypassBlock but it has not been set");
540     return AdditionalBypassBlock;
541   }
542 
543 protected:
544   friend class LoopVectorizationPlanner;
545 
546   /// Set up the values of the IVs correctly when exiting the vector loop.
547   virtual void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
548                             Value *VectorTripCount, BasicBlock *MiddleBlock,
549                             VPTransformState &State);
550 
551   /// Iteratively sink the scalarized operands of a predicated instruction into
552   /// the block that was created for it.
553   void sinkScalarOperands(Instruction *PredInst);
554 
555   /// Returns (and creates if needed) the trip count of the widened loop.
556   Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);
557 
558   /// Emit a bypass check to see if the vector trip count is zero, including if
559   /// it overflows.
560   void emitIterationCountCheck(BasicBlock *Bypass);
561 
562   /// Emit a bypass check to see if all of the SCEV assumptions we've
563   /// had to make are correct. Returns the block containing the checks or
564   /// nullptr if no checks have been added.
565   BasicBlock *emitSCEVChecks(BasicBlock *Bypass);
566 
567   /// Emit bypass checks to check any memory assumptions we may have made.
568   /// Returns the block containing the checks or nullptr if no checks have been
569   /// added.
570   BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass);
571 
572   /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
573   /// vector loop preheader, middle block and scalar preheader.
574   void createVectorLoopSkeleton(StringRef Prefix);
575 
576   /// Create and record the resume values for induction variables coming from
577   /// the additional bypass block.
578   void createInductionAdditionalBypassValues(const SCEV2ValueTy &ExpandedSCEVs,
579                                              Value *MainVectorTripCount);
580 
581   /// Allow subclasses to override and print debug traces before/after vplan
582   /// execution, when trace information is requested.
583   virtual void printDebugTracesAtStart() {}
584   virtual void printDebugTracesAtEnd() {}
585 
586   /// Introduces a new VPIRBasicBlock for \p CheckIRBB to Plan between the
587   /// vector preheader and its predecessor, also connecting the new block to the
588   /// scalar preheader.
589   void introduceCheckBlockInVPlan(BasicBlock *CheckIRBB);
590 
591   /// The original loop.
592   Loop *OrigLoop;
593 
594   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
595   /// dynamic knowledge to simplify SCEV expressions and converts them to a
596   /// more usable form.
597   PredicatedScalarEvolution &PSE;
598 
599   /// Loop Info.
600   LoopInfo *LI;
601 
602   /// Dominator Tree.
603   DominatorTree *DT;
604 
605   /// Target Library Info.
606   const TargetLibraryInfo *TLI;
607 
608   /// Target Transform Info.
609   const TargetTransformInfo *TTI;
610 
611   /// Assumption Cache.
612   AssumptionCache *AC;
613 
614   /// Interface to emit optimization remarks.
615   OptimizationRemarkEmitter *ORE;
616 
617   /// The vectorization SIMD factor to use. Each vector will have this many
618   /// vector elements.
619   ElementCount VF;
620 
621   ElementCount MinProfitableTripCount;
622 
623   /// The vectorization unroll factor to use. Each scalar is vectorized to this
624   /// many different vector instructions.
625   unsigned UF;
626 
627   /// The IR builder we use to create new instructions.
628   IRBuilder<> Builder;
629 
630   // --- Vectorization state ---
631 
632   /// The vector-loop preheader.
633   BasicBlock *LoopVectorPreHeader;
634 
635   /// The scalar-loop preheader.
636   BasicBlock *LoopScalarPreHeader;
637 
638   /// Middle Block between the vector and the scalar.
639   BasicBlock *LoopMiddleBlock;
640 
641   /// A list of all bypass blocks. The first block is the entry of the loop.
642   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
643 
644   /// The instructions that were predicated.
645   SmallVector<Instruction *, 4> PredicatedInstructions;
646 
647   /// Trip count of the original loop.
648   Value *TripCount = nullptr;
649 
650   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
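  /// For example, with TripCount = 103, VF = 4 and UF = 2 this is
  /// 103 - 103 % 8 = 96, i.e. 12 iterations of the widened loop.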
651   Value *VectorTripCount = nullptr;
652 
653   /// The legality analysis.
654   LoopVectorizationLegality *Legal;
655 
656   /// The profitability analysis.
657   LoopVectorizationCostModel *Cost;
658 
659   // Record whether runtime checks are added.
660   bool AddedSafetyChecks = false;
661 
662   /// BFI and PSI are used to check for profile guided size optimizations.
663   BlockFrequencyInfo *BFI;
664   ProfileSummaryInfo *PSI;
665 
666   // Whether this loop should be optimized for size based on profile guided size
667   // optimizations.
668   bool OptForSizeBasedOnProfile;
669 
670   /// Structure to hold information about generated runtime checks, responsible
671   /// for cleaning the checks, if vectorization turns out unprofitable.
672   GeneratedRTChecks &RTChecks;
673 
674   /// Mapping of induction phis to their additional bypass values. They
675   /// need to be added as operands to phi nodes in the scalar loop preheader
676   /// after the epilogue skeleton has been created.
677   DenseMap<PHINode *, Value *> Induction2AdditionalBypassValue;
678 
679   /// The additional bypass block which conditionally skips over the epilogue
680   /// loop after executing the main loop. Needed to resume inductions and
681   /// reductions during epilogue vectorization.
682   BasicBlock *AdditionalBypassBlock = nullptr;
683 
684   VPlan &Plan;
685 
686   /// The vector preheader block of \p Plan, used as target for check blocks
687   /// introduced during skeleton creation.
688   VPBlockBase *VectorPHVPB;
689 };
690 
691 /// Encapsulate information regarding vectorization of a loop and its epilogue.
692 /// This information is meant to be updated and used across two stages of
693 /// epilogue vectorization.
694 struct EpilogueLoopVectorizationInfo {
695   ElementCount MainLoopVF = ElementCount::getFixed(0);
696   unsigned MainLoopUF = 0;
697   ElementCount EpilogueVF = ElementCount::getFixed(0);
698   unsigned EpilogueUF = 0;
699   BasicBlock *MainLoopIterationCountCheck = nullptr;
700   BasicBlock *EpilogueIterationCountCheck = nullptr;
701   BasicBlock *SCEVSafetyCheck = nullptr;
702   BasicBlock *MemSafetyCheck = nullptr;
703   Value *TripCount = nullptr;
704   Value *VectorTripCount = nullptr;
705   VPlan &EpiloguePlan;
706 
707   EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
708                                 ElementCount EVF, unsigned EUF,
709                                 VPlan &EpiloguePlan)
710       : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF),
711         EpiloguePlan(EpiloguePlan) {
712     assert(EUF == 1 &&
713            "A high UF for the epilogue loop is likely not beneficial.");
714   }
715 };
716 
717 /// An extension of the inner loop vectorizer that creates a skeleton for a
718 /// vectorized loop that has its epilogue (residual) also vectorized.
719 /// The idea is to run the vplan on a given loop twice, first to set up the
720 /// skeleton and vectorize the main loop, and second to complete the skeleton
721 /// from the first step and vectorize the epilogue.  This is achieved by
722 /// deriving two concrete strategy classes from this base class and invoking
723 /// them in succession from the loop vectorizer planner.
724 class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
725 public:
726   InnerLoopAndEpilogueVectorizer(
727       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
728       DominatorTree *DT, const TargetLibraryInfo *TLI,
729       const TargetTransformInfo *TTI, AssumptionCache *AC,
730       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
731       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
732       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
733       GeneratedRTChecks &Checks, VPlan &Plan)
734       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
735                             EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
736                             CM, BFI, PSI, Checks, Plan),
737         EPI(EPI) {}
738 
739   // Override this function to handle the more complex control flow around the
740   // three loops.
741   BasicBlock *
742   createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final {
743     return createEpilogueVectorizedLoopSkeleton(ExpandedSCEVs);
744   }
745 
746   /// The interface for creating a vectorized skeleton using one of two
747   /// different strategies, each corresponding to one execution of the vplan
748   /// as described above.
749   virtual BasicBlock *
750   createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) = 0;
751 
752   /// Holds and updates state information required to vectorize the main loop
753   /// and its epilogue in two separate passes. This setup helps us avoid
754   /// regenerating and recomputing runtime safety checks. It also helps us to
755   /// shorten the iteration-count-check path length for the cases where the
756   /// iteration count of the loop is so small that the main vector loop is
757   /// completely skipped.
758   EpilogueLoopVectorizationInfo &EPI;
759 };
760 
761 /// A specialized derived class of inner loop vectorizer that performs
762 /// vectorization of *main* loops in the process of vectorizing loops and their
763 /// epilogues.
764 class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
765 public:
766   EpilogueVectorizerMainLoop(
767       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
768       DominatorTree *DT, const TargetLibraryInfo *TLI,
769       const TargetTransformInfo *TTI, AssumptionCache *AC,
770       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
771       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
772       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
773       GeneratedRTChecks &Check, VPlan &Plan)
774       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
775                                        EPI, LVL, CM, BFI, PSI, Check, Plan) {}
776   /// Implements the interface for creating a vectorized skeleton using the
777   /// *main loop* strategy (i.e. the first pass of vplan execution).
778   BasicBlock *
779   createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
780 
781 protected:
782   /// Emits an iteration count bypass check once for the main loop (when \p
783   /// ForEpilogue is false) and once for the epilogue loop (when \p
784   /// ForEpilogue is true).
785   BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
786   void printDebugTracesAtStart() override;
787   void printDebugTracesAtEnd() override;
788 
789   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
790                     Value *VectorTripCount, BasicBlock *MiddleBlock,
791                     VPTransformState &State) override {}
792 };
793 
794 // A specialized derived class of inner loop vectorizer that performs
795 // vectorization of *epilogue* loops in the process of vectorizing loops and
796 // their epilogues.
797 class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
798 public:
799   EpilogueVectorizerEpilogueLoop(
800       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
801       DominatorTree *DT, const TargetLibraryInfo *TLI,
802       const TargetTransformInfo *TTI, AssumptionCache *AC,
803       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
804       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
805       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
806       GeneratedRTChecks &Checks, VPlan &Plan)
807       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
808                                        EPI, LVL, CM, BFI, PSI, Checks, Plan) {
809     TripCount = EPI.TripCount;
810   }
811   /// Implements the interface for creating a vectorized skeleton using the
812   /// *epilogue loop* strategy (i.e. the second pass of vplan execution).
813   BasicBlock *
814   createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
815 
816 protected:
817   /// Emits an iteration count bypass check after the main vector loop has
818   /// finished to see if there are any iterations left to execute by either
819   /// the vector epilogue or the scalar epilogue.
820   BasicBlock *
821   emitMinimumVectorEpilogueIterCountCheck(BasicBlock *Bypass,
822                                           BasicBlock *Insert);
823   void printDebugTracesAtStart() override;
824   void printDebugTracesAtEnd() override;
825 };
826 } // end namespace llvm
827 
828 /// Look for a meaningful debug location on the instruction or its operands.
829 static DebugLoc getDebugLocFromInstOrOperands(Instruction *I) {
830   if (!I)
831     return DebugLoc();
832 
833   DebugLoc Empty;
834   if (I->getDebugLoc() != Empty)
835     return I->getDebugLoc();
836 
837   for (Use &Op : I->operands()) {
838     if (Instruction *OpInst = dyn_cast<Instruction>(Op))
839       if (OpInst->getDebugLoc() != Empty)
840         return OpInst->getDebugLoc();
841   }
842 
843   return I->getDebugLoc();
844 }
845 
846 /// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
847 /// is passed, the message relates to that particular instruction.
848 #ifndef NDEBUG
849 static void debugVectorizationMessage(const StringRef Prefix,
850                                       const StringRef DebugMsg,
851                                       Instruction *I) {
852   dbgs() << "LV: " << Prefix << DebugMsg;
853   if (I != nullptr)
854     dbgs() << " " << *I;
855   else
856     dbgs() << '.';
857   dbgs() << '\n';
858 }
859 #endif
860 
861 /// Create an analysis remark that explains why vectorization failed
862 ///
863 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
864 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
865 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
866 /// the location of the remark. If \p DL is passed, use it as debug location for
867 /// the remark. \return the remark object that can be streamed to.
868 static OptimizationRemarkAnalysis
869 createLVAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop,
870                  Instruction *I, DebugLoc DL = {}) {
871   Value *CodeRegion = I ? I->getParent() : TheLoop->getHeader();
872   // If debug location is attached to the instruction, use it. Otherwise if DL
873   // was not provided, use the loop's.
874   if (I && I->getDebugLoc())
875     DL = I->getDebugLoc();
876   else if (!DL)
877     DL = TheLoop->getStartLoc();
878 
879   return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
880 }
881 
882 namespace llvm {
883 
884 /// Return a value for Step multiplied by VF.
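/// For example, for a fixed VF of 4 and Step of 2 this returns the constant 8;
/// for a scalable VF of <vscale x 4> it returns an expression equivalent to
/// 8 * vscale.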
885 Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
886                        int64_t Step) {
887   assert(Ty->isIntegerTy() && "Expected an integer step");
888   return B.CreateElementCount(Ty, VF.multiplyCoefficientBy(Step));
889 }
890 
891 /// Return the runtime value for VF.
892 Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
893   return B.CreateElementCount(Ty, VF);
894 }
895 
896 void reportVectorizationFailure(const StringRef DebugMsg,
897                                 const StringRef OREMsg, const StringRef ORETag,
898                                 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
899                                 Instruction *I) {
900   LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
901   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
902   ORE->emit(
903       createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
904       << "loop not vectorized: " << OREMsg);
905 }
906 
907 /// Reports an informative message: print \p Msg for debugging purposes as well
908 /// as an optimization remark. Uses either \p I as location of the remark, or
909 /// otherwise \p TheLoop. If \p DL is passed, use it as debug location for the
910 /// remark.
911 static void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
912                                     OptimizationRemarkEmitter *ORE,
913                                     Loop *TheLoop, Instruction *I = nullptr,
914                                     DebugLoc DL = {}) {
915   LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
916   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
917   ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop,
918                              I, DL)
919             << Msg);
920 }
921 
922 /// Report successful vectorization of the loop. In case an outer loop is
923 /// vectorized, prepend "outer" to the vectorization remark.
924 static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop,
925                                 VectorizationFactor VF, unsigned IC) {
926   LLVM_DEBUG(debugVectorizationMessage(
927       "Vectorizing: ", TheLoop->isInnermost() ? "innermost loop" : "outer loop",
928       nullptr));
929   StringRef LoopType = TheLoop->isInnermost() ? "" : "outer ";
930   ORE->emit([&]() {
931     return OptimizationRemark(LV_NAME, "Vectorized", TheLoop->getStartLoc(),
932                               TheLoop->getHeader())
933            << "vectorized " << LoopType << "loop (vectorization width: "
934            << ore::NV("VectorizationFactor", VF.Width)
935            << ", interleaved count: " << ore::NV("InterleaveCount", IC) << ")";
936   });
937 }
938 
939 } // end namespace llvm
940 
941 namespace llvm {
942 
943 // Loop vectorization cost-model hints for how the scalar epilogue loop should
944 // be lowered.
945 enum ScalarEpilogueLowering {
946 
947   // The default: allowing scalar epilogues.
948   CM_ScalarEpilogueAllowed,
949 
950   // Vectorization with OptForSize: don't allow epilogues.
951   CM_ScalarEpilogueNotAllowedOptSize,
952 
953   // A special case of vectorization with OptForSize: loops with a very small
954   // trip count are considered for vectorization under OptForSize, thereby
955   // making sure the cost of their loop body is dominant, free of runtime
956   // guards and scalar iteration overheads.
957   CM_ScalarEpilogueNotAllowedLowTripLoop,
958 
959   // Loop hint predicate indicating an epilogue is undesired.
960   CM_ScalarEpilogueNotNeededUsePredicate,
961 
962   // Directive indicating we must either tail fold or not vectorize
963   CM_ScalarEpilogueNotAllowedUsePredicate
964 };
965 
966 using InstructionVFPair = std::pair<Instruction *, ElementCount>;
967 
968 /// LoopVectorizationCostModel - estimates the expected speedups due to
969 /// vectorization.
970 /// In many cases vectorization is not profitable. This can happen because of
971 /// a number of reasons. In this class we mainly attempt to predict the
972 /// expected speedup/slowdowns due to the supported instruction set. We use the
973 /// TargetTransformInfo to query the different backends for the cost of
974 /// different operations.
975 class LoopVectorizationCostModel {
976   friend class LoopVectorizationPlanner;
977 
978 public:
979   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
980                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
981                              LoopVectorizationLegality *Legal,
982                              const TargetTransformInfo &TTI,
983                              const TargetLibraryInfo *TLI, DemandedBits *DB,
984                              AssumptionCache *AC,
985                              OptimizationRemarkEmitter *ORE, const Function *F,
986                              const LoopVectorizeHints *Hints,
987                              InterleavedAccessInfo &IAI)
988       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
989         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
990         Hints(Hints), InterleaveInfo(IAI) {}
991 
992   /// \return An upper bound for the vectorization factors (both fixed and
993   /// scalable). If the factors are 0, vectorization and interleaving should be
994   /// avoided up front.
995   FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
996 
997   /// \return True if runtime checks are required for vectorization, and false
998   /// otherwise.
999   bool runtimeChecksRequired();
1000 
1001   /// Setup cost-based decisions for user vectorization factor.
1002   /// \return true if the UserVF is a feasible VF to be chosen.
1003   bool selectUserVectorizationFactor(ElementCount UserVF) {
1004     collectUniformsAndScalars(UserVF);
1005     collectInstsToScalarize(UserVF);
1006     return expectedCost(UserVF).isValid();
1007   }
1008 
1009   /// \return The size (in bits) of the smallest and widest types in the code
1010   /// that needs to be vectorized. We ignore values that remain scalar such as
1011   /// 64 bit loop indices.
1012   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1013 
1014   /// \return The desired interleave count.
1015   /// If interleave count has been specified by metadata it will be returned.
1016   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1017   /// are the selected vectorization factor and the cost of the selected VF.
1018   unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost);
1019 
1020   /// A memory access instruction may be vectorized in more than one way; the
1021   /// form the instruction takes after vectorization depends on its cost.
1022   /// This function takes cost-based decisions for Load/Store instructions
1023   /// and collects them in a map. This decision map is used for building
1024   /// the lists of loop-uniform and loop-scalar instructions.
1025   /// The calculated cost is saved with widening decision in order to
1026   /// avoid redundant calculations.
1027   void setCostBasedWideningDecision(ElementCount VF);
1028 
1029   /// A call may be vectorized in different ways depending on whether we have
1030   /// vectorized variants available and whether the target supports masking.
1031   /// This function analyzes all calls in the function at the supplied VF,
1032   /// makes a decision based on the costs of available options, and stores that
1033   /// decision in a map for use in planning and plan execution.
1034   void setVectorizedCallDecision(ElementCount VF);
1035 
1036   /// A struct that represents some properties of the register usage
1037   /// of a loop.
1038   struct RegisterUsage {
1039     /// Holds the number of loop invariant values that are used in the loop.
1040     /// The key is ClassID of target-provided register class.
1041     SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1042     /// Holds the maximum number of concurrent live intervals in the loop.
1043     /// The key is ClassID of target-provided register class.
1044     SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1045   };
1046 
1047   /// \return Information about the register usage of the loop for the
1048   /// given vectorization factors.
1049   SmallVector<RegisterUsage, 8>
1050   calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1051 
1052   /// Collect values we want to ignore in the cost model.
1053   void collectValuesToIgnore();
1054 
1055   /// Collect all element types in the loop for which widening is needed.
1056   void collectElementTypesForWidening();
1057 
1058   /// Split reductions into those that happen in the loop, and those that happen
1059   /// outside. In-loop reductions are collected into InLoopReductions.
1060   void collectInLoopReductions();
1061 
1062   /// Returns true if we should use strict in-order reductions for the given
1063   /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
1064   /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
1065   /// of FP operations.
1066   bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
1067     return !Hints->allowReordering() && RdxDesc.isOrdered();
1068   }
1069 
1070   /// \returns The smallest bitwidth each instruction can be represented with.
1071   /// The vector equivalents of these instructions should be truncated to this
1072   /// type.
1073   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1074     return MinBWs;
1075   }
1076 
1077   /// \returns True if it is more profitable to scalarize instruction \p I for
1078   /// vectorization factor \p VF.
1079   bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1080     assert(VF.isVector() &&
1081            "Profitable to scalarize relevant only for VF > 1.");
1082     assert(
1083         TheLoop->isInnermost() &&
1084         "cost-model should not be used for outer loops (in VPlan-native path)");
1085 
1086     auto Scalars = InstsToScalarize.find(VF);
1087     assert(Scalars != InstsToScalarize.end() &&
1088            "VF not yet analyzed for scalarization profitability");
1089     return Scalars->second.contains(I);
1090   }
1091 
1092   /// Returns true if \p I is known to be uniform after vectorization.
1093   bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1094     assert(
1095         TheLoop->isInnermost() &&
1096         "cost-model should not be used for outer loops (in VPlan-native path)");
1097     // Pseudo probe needs to be duplicated for each unrolled iteration and
1098     // vector lane so that profiled loop trip count can be accurately
1099     // accumulated instead of being undercounted.
1100     if (isa<PseudoProbeInst>(I))
1101       return false;
1102 
1103     if (VF.isScalar())
1104       return true;
1105 
1106     auto UniformsPerVF = Uniforms.find(VF);
1107     assert(UniformsPerVF != Uniforms.end() &&
1108            "VF not yet analyzed for uniformity");
1109     return UniformsPerVF->second.count(I);
1110   }
1111 
1112   /// Returns true if \p I is known to be scalar after vectorization.
1113   bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1114     assert(
1115         TheLoop->isInnermost() &&
1116         "cost-model should not be used for outer loops (in VPlan-native path)");
1117     if (VF.isScalar())
1118       return true;
1119 
1120     auto ScalarsPerVF = Scalars.find(VF);
1121     assert(ScalarsPerVF != Scalars.end() &&
1122            "Scalar values are not calculated for VF");
1123     return ScalarsPerVF->second.count(I);
1124   }
1125 
1126   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1127   /// for vectorization factor \p VF.
1128   bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1129     return VF.isVector() && MinBWs.contains(I) &&
1130            !isProfitableToScalarize(I, VF) &&
1131            !isScalarAfterVectorization(I, VF);
1132   }
1133 
1134   /// Decision that was taken during cost calculation for memory instruction.
1135   enum InstWidening {
1136     CM_Unknown,
1137     CM_Widen,         // For consecutive accesses with stride +1.
1138     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1139     CM_Interleave,
1140     CM_GatherScatter,
1141     CM_Scalarize,
1142     CM_VectorCall,
1143     CM_IntrinsicCall
1144   };
1145 
1146   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1147   /// instruction \p I and vector width \p VF.
1148   void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1149                            InstructionCost Cost) {
1150     assert(VF.isVector() && "Expected VF >=2");
1151     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1152   }
1153 
1154   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1155   /// interleaving group \p Grp and vector width \p VF.
1156   void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1157                            ElementCount VF, InstWidening W,
1158                            InstructionCost Cost) {
1159     assert(VF.isVector() && "Expected VF >=2");
1160     /// Broadcast this decision to all instructions inside the group.
1161     /// When interleaving, the cost will only be assigned to one instruction, the
1162     /// insert position. For other cases, add the appropriate fraction of the
1163     /// total cost to each instruction. This ensures accurate costs are used,
1164     /// even if the insert position instruction is not used.
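    /// For example, for a group with 4 members and a total cost of 8: with
    /// CM_Interleave the insert position is assigned the full cost of 8 and
    /// the other members 0; for any other decision every member is assigned
    /// 8 / 4 = 2.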
1165     InstructionCost InsertPosCost = Cost;
1166     InstructionCost OtherMemberCost = 0;
1167     if (W != CM_Interleave)
1168       OtherMemberCost = InsertPosCost = Cost / Grp->getNumMembers();
1170     for (unsigned Idx = 0; Idx < Grp->getFactor(); ++Idx) {
1171       if (auto *I = Grp->getMember(Idx)) {
1172         if (Grp->getInsertPos() == I)
1173           WideningDecisions[std::make_pair(I, VF)] =
1174               std::make_pair(W, InsertPosCost);
1175         else
1176           WideningDecisions[std::make_pair(I, VF)] =
1177               std::make_pair(W, OtherMemberCost);
1178       }
1179     }
1180   }
1181 
1182   /// Return the cost model decision for the given instruction \p I and vector
1183   /// width \p VF. Return CM_Unknown if this instruction did not pass
1184   /// through the cost modeling.
1185   InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1186     assert(VF.isVector() && "Expected VF to be a vector VF");
1187     assert(
1188         TheLoop->isInnermost() &&
1189         "cost-model should not be used for outer loops (in VPlan-native path)");
1190 
1191     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1192     auto Itr = WideningDecisions.find(InstOnVF);
1193     if (Itr == WideningDecisions.end())
1194       return CM_Unknown;
1195     return Itr->second.first;
1196   }
1197 
1198   /// Return the vectorization cost for the given instruction \p I and vector
1199   /// width \p VF.
1200   InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1201     assert(VF.isVector() && "Expected VF >=2");
1202     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1203     assert(WideningDecisions.contains(InstOnVF) &&
1204            "The cost is not calculated");
1205     return WideningDecisions[InstOnVF].second;
1206   }
1207 
1208   struct CallWideningDecision {
1209     InstWidening Kind;
1210     Function *Variant;
1211     Intrinsic::ID IID;
1212     std::optional<unsigned> MaskPos;
1213     InstructionCost Cost;
1214   };
1215 
1216   void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind,
1217                                Function *Variant, Intrinsic::ID IID,
1218                                std::optional<unsigned> MaskPos,
1219                                InstructionCost Cost) {
1220     assert(!VF.isScalar() && "Expected vector VF");
1221     CallWideningDecisions[std::make_pair(CI, VF)] = {Kind, Variant, IID,
1222                                                      MaskPos, Cost};
1223   }
1224 
1225   CallWideningDecision getCallWideningDecision(CallInst *CI,
1226                                                ElementCount VF) const {
1227     assert(!VF.isScalar() && "Expected vector VF");
1228     return CallWideningDecisions.at(std::make_pair(CI, VF));
1229   }
1230 
1231   /// Return True if instruction \p I is an optimizable truncate whose operand
1232   /// is an induction variable. Such a truncate will be removed by adding a new
1233   /// induction variable with the destination type.
1234   bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1235     // If the instruction is not a truncate, return false.
1236     auto *Trunc = dyn_cast<TruncInst>(I);
1237     if (!Trunc)
1238       return false;
1239 
1240     // Get the source and destination types of the truncate.
1241     Type *SrcTy = toVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1242     Type *DestTy = toVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1243 
1244     // If the truncate is free for the given types, return false. Replacing a
1245     // free truncate with an induction variable would add an induction variable
1246     // update instruction to each iteration of the loop. We exclude from this
1247     // check the primary induction variable since it will need an update
1248     // instruction regardless.
1249     Value *Op = Trunc->getOperand(0);
1250     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1251       return false;
1252 
1253     // If the truncated value is not an induction variable, return false.
1254     return Legal->isInductionPhi(Op);
1255   }
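
  // Illustrative example: given a loop whose only induction is an i64 counter
  // and a use such as "trunc i64 %iv to i32", the truncate can be folded away
  // by introducing a parallel i32 induction with the same start and step, as
  // long as the truncate is not already free for the target (or %iv is the
  // primary induction, which needs an update instruction anyway).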
1256 
1257   /// Collects the instructions to scalarize for each predicated instruction in
1258   /// the loop.
1259   void collectInstsToScalarize(ElementCount VF);
1260 
1261   /// Collect Uniform and Scalar values for the given \p VF.
1262   /// The sets depend on CM decision for Load/Store instructions
1263   /// that may be vectorized as interleave, gather-scatter or scalarized.
1264   /// Also make a decision on what to do about call instructions in the loop
1265   /// at that VF -- scalarize, call a known vector routine, or call a
1266   /// vector intrinsic.
1267   void collectUniformsAndScalars(ElementCount VF) {
1268     // Do the analysis once.
1269     if (VF.isScalar() || Uniforms.contains(VF))
1270       return;
1271     setCostBasedWideningDecision(VF);
1272     collectLoopUniforms(VF);
1273     setVectorizedCallDecision(VF);
1274     collectLoopScalars(VF);
1275   }
1276 
1277   /// Returns true if the target machine supports masked store operation
1278   /// for the given \p DataType and kind of access to \p Ptr.
1279   bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1280     return Legal->isConsecutivePtr(DataType, Ptr) &&
1281            TTI.isLegalMaskedStore(DataType, Alignment);
1282   }
1283 
1284   /// Returns true if the target machine supports masked load operation
1285   /// for the given \p DataType and kind of access to \p Ptr.
1286   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1287     return Legal->isConsecutivePtr(DataType, Ptr) &&
1288            TTI.isLegalMaskedLoad(DataType, Alignment);
1289   }
1290 
1291   /// Returns true if the target machine can represent \p V as a masked gather
1292   /// or scatter operation.
1293   bool isLegalGatherOrScatter(Value *V, ElementCount VF) {
1294     bool LI = isa<LoadInst>(V);
1295     bool SI = isa<StoreInst>(V);
1296     if (!LI && !SI)
1297       return false;
1298     auto *Ty = getLoadStoreType(V);
1299     Align Align = getLoadStoreAlignment(V);
1300     if (VF.isVector())
1301       Ty = VectorType::get(Ty, VF);
1302     return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1303            (SI && TTI.isLegalMaskedScatter(Ty, Align));
1304   }
1305 
1306   /// Returns true if the target machine supports all of the reduction
1307   /// variables found for the given VF.
1308   bool canVectorizeReductions(ElementCount VF) const {
1309     return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1310       const RecurrenceDescriptor &RdxDesc = Reduction.second;
1311       return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1312     }));
1313   }
1314 
1315   /// Given costs for both strategies, return true if the scalar predication
1316   /// lowering should be used for div/rem.  This incorporates an override
1317   /// option so it is not simply a cost comparison.
1318   bool isDivRemScalarWithPredication(InstructionCost ScalarCost,
1319                                      InstructionCost SafeDivisorCost) const {
1320     switch (ForceSafeDivisor) {
1321     case cl::BOU_UNSET:
1322       return ScalarCost < SafeDivisorCost;
1323     case cl::BOU_TRUE:
1324       return false;
1325     case cl::BOU_FALSE:
1326       return true;
1327     }
1328     llvm_unreachable("impossible case value");
1329   }
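
  // For example, with hypothetical costs ScalarCost = 4 and SafeDivisorCost =
  // 6, the default (ForceSafeDivisor unset) picks scalar predication because
  // 4 < 6; forcing the option to true always selects the safe-divisor
  // strategy, and forcing it to false always selects scalarization, regardless
  // of the costs.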
1330 
1331   /// Returns true if \p I is an instruction which requires predication and
1332   /// for which our chosen predication strategy is scalarization (i.e. we
1333   /// don't have an alternate strategy such as masking available).
1334   /// \p VF is the vectorization factor that will be used to vectorize \p I.
1335   bool isScalarWithPredication(Instruction *I, ElementCount VF) const;
1336 
1337   /// Returns true if \p I is an instruction that needs to be predicated
1338   /// at runtime.  The result is independent of the predication mechanism.
1339   /// Superset of instructions that return true for isScalarWithPredication.
1340   bool isPredicatedInst(Instruction *I) const;
1341 
1342   /// Return the costs for our two available strategies for lowering a
1343   /// div/rem operation which requires speculating at least one lane.
1344   /// First result is for scalarization (will be invalid for scalable
1345   /// vectors); second is for the safe-divisor strategy.
1346   std::pair<InstructionCost, InstructionCost>
1347   getDivRemSpeculationCost(Instruction *I,
1348                            ElementCount VF) const;
1349 
1350   /// Returns true if \p I is a memory instruction with consecutive memory
1351   /// access that can be widened.
1352   bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF);
1353 
1354   /// Returns true if \p I is a memory instruction in an interleaved-group
1355   /// of memory accesses that can be vectorized with wide vector loads/stores
1356   /// and shuffles.
1357   bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF) const;
1358 
1359   /// Check if \p Instr belongs to any interleaved access group.
1360   bool isAccessInterleaved(Instruction *Instr) const {
1361     return InterleaveInfo.isInterleaved(Instr);
1362   }
1363 
1364   /// Get the interleaved access group that \p Instr belongs to.
1365   const InterleaveGroup<Instruction> *
1366   getInterleavedAccessGroup(Instruction *Instr) const {
1367     return InterleaveInfo.getInterleaveGroup(Instr);
1368   }
1369 
1370   /// Returns true if we're required to use a scalar epilogue for at least
1371   /// the final iteration of the original loop.
1372   bool requiresScalarEpilogue(bool IsVectorizing) const {
1373     if (!isScalarEpilogueAllowed()) {
1374       LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
1375       return false;
1376     }
1377     // If we might exit from anywhere but the latch and early exit vectorization
1378     // is disabled, we must run the exiting iteration in scalar form.
1379     if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch() &&
1380         !(EnableEarlyExitVectorization && Legal->hasUncountableEarlyExit())) {
1381       LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: not exiting "
1382                            "from latch block\n");
1383       return true;
1384     }
1385     if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) {
1386       LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: "
1387                            "interleaved group requires scalar epilogue\n");
1388       return true;
1389     }
1390     LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
1391     return false;
1392   }
1393 
1394   /// Returns true if we're required to use a scalar epilogue for at least
1395   /// the final iteration of the original loop for all VFs in \p Range.
1396   /// A scalar epilogue must either be required for all VFs in \p Range or for
1397   /// none.
1398   bool requiresScalarEpilogue(VFRange Range) const {
1399     auto RequiresScalarEpilogue = [this](ElementCount VF) {
1400       return requiresScalarEpilogue(VF.isVector());
1401     };
1402     bool IsRequired = all_of(Range, RequiresScalarEpilogue);
1403     assert(
1404         (IsRequired || none_of(Range, RequiresScalarEpilogue)) &&
1405         "all VFs in range must agree on whether a scalar epilogue is required");
1406     return IsRequired;
1407   }
1408 
1409   /// Returns true if a scalar epilogue is not allowed due to optsize or a
1410   /// loop hint annotation.
1411   bool isScalarEpilogueAllowed() const {
1412     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1413   }
1414 
1415   /// Returns the TailFoldingStyle that is best for the current loop.
1416   TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
1417     if (!ChosenTailFoldingStyle)
1418       return TailFoldingStyle::None;
1419     return IVUpdateMayOverflow ? ChosenTailFoldingStyle->first
1420                                : ChosenTailFoldingStyle->second;
1421   }
1422 
1423   /// Selects and saves TailFoldingStyle for 2 options - if IV update may
1424   /// overflow or not.
1425   /// \param IsScalableVF true if scalable vectorization factors are enabled.
1426   /// \param UserIC User-specified interleave count.
1427   void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC) {
1428     assert(!ChosenTailFoldingStyle && "Tail folding must not be selected yet.");
1429     if (!Legal->canFoldTailByMasking()) {
1430       ChosenTailFoldingStyle =
1431           std::make_pair(TailFoldingStyle::None, TailFoldingStyle::None);
1432       return;
1433     }
1434 
1435     if (!ForceTailFoldingStyle.getNumOccurrences()) {
1436       ChosenTailFoldingStyle = std::make_pair(
1437           TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/true),
1438           TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false));
1439       return;
1440     }
1441 
1442     // Set styles when forced.
1443     ChosenTailFoldingStyle = std::make_pair(ForceTailFoldingStyle.getValue(),
1444                                             ForceTailFoldingStyle.getValue());
1445     if (ForceTailFoldingStyle != TailFoldingStyle::DataWithEVL)
1446       return;
1447     // Override forced styles if needed.
1448     // FIXME: use actual opcode/data type for analysis here.
1449     // FIXME: Investigate opportunity for fixed vector factor.
1450     bool EVLIsLegal = UserIC <= 1 &&
1451                       TTI.hasActiveVectorLength(0, nullptr, Align()) &&
1452                       !EnableVPlanNativePath;
1453     if (!EVLIsLegal) {
1454       // If for some reason EVL mode is unsupported, fallback to
1455       // DataWithoutLaneMask to try to vectorize the loop with folded tail
1456       // in a generic way.
1457       ChosenTailFoldingStyle =
1458           std::make_pair(TailFoldingStyle::DataWithoutLaneMask,
1459                          TailFoldingStyle::DataWithoutLaneMask);
1460       LLVM_DEBUG(
1461           dbgs()
1462           << "LV: Preference for VP intrinsics indicated. Will "
1463              "not try to generate VP Intrinsics "
1464           << (UserIC > 1
1465                   ? "since interleave count specified is greater than 1.\n"
1466                   : "due to non-interleaving reasons.\n"));
1467     }
1468   }
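
  // Illustrative outcome for a hypothetical target: if it prefers
  // DataAndControlFlow when the IV update may overflow but plain Data when it
  // cannot, the pair (DataAndControlFlow, Data) is recorded, and a later call
  // to getTailFoldingStyle(/*IVUpdateMayOverflow=*/false) returns Data.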
1469 
1470   /// Returns true if all loop blocks should be masked to fold tail loop.
1471   bool foldTailByMasking() const {
1472     // TODO: check if it is possible to check for None style independent of
1473     // IVUpdateMayOverflow flag in getTailFoldingStyle.
1474     return getTailFoldingStyle() != TailFoldingStyle::None;
1475   }
1476 
1477   /// Return the maximum safe number of elements to be processed per vector
1478   /// iteration, which do not prevent store-load forwarding and are safe with
1479   /// regard to the memory dependencies. Required for EVL-based VPlans to
1480   /// correctly calculate AVL (application vector length) as min(remaining AVL,
1481   /// MaxSafeElements).
1482   /// TODO: need to consider adjusting cost model to use this value as a
1483   /// vectorization factor for EVL-based vectorization.
1484   std::optional<unsigned> getMaxSafeElements() const { return MaxSafeElements; }
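
  // For example (hypothetical values): with MaxSafeElements = 8, an EVL-based
  // plan computes the AVL for each vector iteration as min(remaining trip
  // count, 8), so no more than 8 elements are ever processed at once even if
  // the target could handle a wider vector length.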
1485 
1486   /// Returns true if the instructions in this block require predication
1487   /// for any reason, e.g. because tail folding now requires a predicate
1488   /// or because the block in the original loop was predicated.
1489   bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
1490     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1491   }
1492 
1493   /// Returns true if VP intrinsics with explicit vector length support should
1494   /// be generated in the tail folded loop.
1495   bool foldTailWithEVL() const {
1496     return getTailFoldingStyle() == TailFoldingStyle::DataWithEVL;
1497   }
1498 
1499   /// Returns true if the Phi is part of an inloop reduction.
1500   bool isInLoopReduction(PHINode *Phi) const {
1501     return InLoopReductions.contains(Phi);
1502   }
1503 
1504   /// Returns true if the predicated reduction select should be used to set the
1505   /// incoming value for the reduction phi.
1506   bool usePredicatedReductionSelect(unsigned Opcode, Type *PhiTy) const {
1507     // Force to use predicated reduction select since the EVL of the
1508     // second-to-last iteration might not be VF*UF.
1509     if (foldTailWithEVL())
1510       return true;
1511     return PreferPredicatedReductionSelect ||
1512            TTI.preferPredicatedReductionSelect(
1513                Opcode, PhiTy, TargetTransformInfo::ReductionFlags());
1514   }
1515 
1516   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1517   /// with factor VF.  Return the cost of the instruction, including
1518   /// scalarization overhead if it's needed.
1519   InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1520 
1521   /// Estimate cost of a call instruction CI if it were vectorized with factor
1522   /// VF. Return the cost of the instruction, including scalarization overhead
1523   /// if it's needed.
1524   InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const;
1525 
1526   /// Invalidates decisions already taken by the cost model.
1527   void invalidateCostModelingDecisions() {
1528     WideningDecisions.clear();
1529     CallWideningDecisions.clear();
1530     Uniforms.clear();
1531     Scalars.clear();
1532   }
1533 
1534   /// Returns the expected execution cost. The unit of the cost does
1535   /// not matter because we use the 'cost' units to compare different
1536   /// vector widths. The cost that is returned is *not* normalized by
1537   /// the factor width.
1538   InstructionCost expectedCost(ElementCount VF);
1539 
1540   bool hasPredStores() const { return NumPredStores > 0; }
1541 
1542   /// Returns true if epilogue vectorization is considered profitable, and
1543   /// false otherwise.
1544   /// \p VF is the vectorization factor chosen for the original loop.
1545   /// \p IC is an additional scaling factor applied to VF before
1546   /// comparing to EpilogueVectorizationMinVF.
1547   bool isEpilogueVectorizationProfitable(const ElementCount VF,
1548                                          const unsigned IC) const;
1549 
1550   /// Returns the execution time cost of an instruction for a given vector
1551   /// width. Vector width of one means scalar.
1552   InstructionCost getInstructionCost(Instruction *I, ElementCount VF);
1553 
1554   /// Return the cost of instructions in an inloop reduction pattern, if I is
1555   /// part of that pattern.
1556   std::optional<InstructionCost>
1557   getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
1558                           TTI::TargetCostKind CostKind) const;
1559 
1560   /// Returns true if \p Op should be considered invariant and if it is
1561   /// trivially hoistable.
1562   bool shouldConsiderInvariant(Value *Op);
1563 
1564 private:
1565   unsigned NumPredStores = 0;
1566 
1567   /// \return An upper bound for the vectorization factors for both
1568   /// fixed and scalable vectorization, where the minimum-known number of
1569   /// elements is a power-of-2 larger than zero. If scalable vectorization is
1570   /// disabled or unsupported, then the scalable part will be equal to
1571   /// ElementCount::getScalable(0).
1572   FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount,
1573                                            ElementCount UserVF,
1574                                            bool FoldTailByMasking);
1575 
1576   /// \return the maximized element count based on the target's vector
1577   /// registers and the loop trip-count, but limited to a maximum safe VF.
1578   /// This is a helper function of computeFeasibleMaxVF.
1579   ElementCount getMaximizedVFForTarget(unsigned MaxTripCount,
1580                                        unsigned SmallestType,
1581                                        unsigned WidestType,
1582                                        ElementCount MaxSafeVF,
1583                                        bool FoldTailByMasking);
1584 
1585   /// Checks if scalable vectorization is supported and enabled. Caches the
1586   /// result to avoid repeated debug dumps for repeated queries.
1587   bool isScalableVectorizationAllowed();
1588 
1589   /// \return the maximum legal scalable VF, based on the safe max number
1590   /// of elements.
1591   ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1592 
1593   /// Calculate vectorization cost of memory instruction \p I.
1594   InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1595 
1596   /// The cost computation for scalarized memory instruction.
1597   InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1598 
1599   /// The cost computation for an interleaving group of memory instructions.
1600   InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1601 
1602   /// The cost computation for Gather/Scatter instruction.
1603   InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1604 
1605   /// The cost computation for widening instruction \p I with consecutive
1606   /// memory access.
1607   InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1608 
1609   /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1610   /// Load: scalar load + broadcast.
1611   /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1612   /// element)
1613   InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1614 
1615   /// Estimate the overhead of scalarizing an instruction. This is a
1616   /// convenience wrapper for the type-based getScalarizationOverhead API.
1617   InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF,
1618                                            TTI::TargetCostKind CostKind) const;
1619 
1620   /// Returns true if an artificially high cost for emulated masked memrefs
1621   /// should be used.
1622   bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1623 
1624   /// Map of scalar integer values to the smallest bitwidth they can be legally
1625   /// represented as. The vector equivalents of these values should be truncated
1626   /// to this type.
1627   MapVector<Instruction *, uint64_t> MinBWs;
1628 
1629   /// A type representing the costs for instructions if they were to be
1630   /// scalarized rather than vectorized. The entries are Instruction-Cost
1631   /// pairs.
1632   using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1633 
1634   /// For each VF, the set of BasicBlocks that are known to be present after
1635   /// vectorization as predicated blocks.
1636   DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>>
1637       PredicatedBBsAfterVectorization;
1638 
1639   /// Records whether it is allowed to have the original scalar loop execute at
1640   /// least once. This may be needed as a fallback loop in case runtime
1641   /// aliasing/dependence checks fail, or to handle the tail/remainder
1642   /// iterations when the trip count is unknown or doesn't divide by the VF,
1643   /// or as a peel-loop to handle gaps in interleave-groups.
1644   /// Under optsize and when the trip count is very small we don't allow any
1645   /// iterations to execute in the scalar loop.
1646   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1647 
1648   /// The finally chosen tail-folding style. The first element is used if the
1649   /// IV update may overflow, the second if it does not.
1650   std::optional<std::pair<TailFoldingStyle, TailFoldingStyle>>
1651       ChosenTailFoldingStyle;
1652 
1653   /// true if scalable vectorization is supported and enabled.
1654   std::optional<bool> IsScalableVectorizationAllowed;
1655 
1656   /// Maximum safe number of elements to be processed per vector iteration,
1657   /// which do not prevent store-load forwarding and are safe with regard to the
1658   /// memory dependencies. Required for EVL-based vectorization, where this
1659   /// value is used as the upper bound of the safe AVL.
1660   std::optional<unsigned> MaxSafeElements;
1661 
1662   /// A map holding scalar costs for different vectorization factors. The
1663   /// presence of a cost for an instruction in the mapping indicates that the
1664   /// instruction will be scalarized when vectorizing with the associated
1665   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1666   DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1667 
1668   /// Holds the instructions known to be uniform after vectorization.
1669   /// The data is collected per VF.
1670   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1671 
1672   /// Holds the instructions known to be scalar after vectorization.
1673   /// The data is collected per VF.
1674   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1675 
1676   /// Holds the instructions (address computations) that are forced to be
1677   /// scalarized.
1678   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1679 
1680   /// PHINodes of the reductions that should be expanded in-loop.
1681   SmallPtrSet<PHINode *, 4> InLoopReductions;
1682 
1683   /// A Map of inloop reduction operations and their immediate chain operand.
1684   /// FIXME: This can be removed once reductions can be costed correctly in
1685   /// VPlan. This was added to allow quick lookup of the inloop operations.
1686   DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1687 
1688   /// Returns the expected difference in cost from scalarizing the expression
1689   /// feeding a predicated instruction \p PredInst. The instructions to
1690   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1691   /// non-negative return value implies the expression will be scalarized.
1692   /// Currently, only single-use chains are considered for scalarization.
1693   InstructionCost computePredInstDiscount(Instruction *PredInst,
1694                                           ScalarCostsTy &ScalarCosts,
1695                                           ElementCount VF);
1696 
1697   /// Collect the instructions that are uniform after vectorization. An
1698   /// instruction is uniform if we represent it with a single scalar value in
1699   /// the vectorized loop corresponding to each vector iteration. Examples of
1700   /// uniform instructions include pointer operands of consecutive or
1701   /// interleaved memory accesses. Note that although uniformity implies an
1702   /// instruction will be scalar, the reverse is not true. In general, a
1703   /// scalarized instruction will be represented by VF scalar values in the
1704   /// vectorized loop, each corresponding to an iteration of the original
1705   /// scalar loop.
1706   void collectLoopUniforms(ElementCount VF);
1707 
1708   /// Collect the instructions that are scalar after vectorization. An
1709   /// instruction is scalar if it is known to be uniform or will be scalarized
1710   /// during vectorization. collectLoopScalars should only add non-uniform nodes
1711   /// to the list if they are used by a load/store instruction that is marked as
1712   /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
1713   /// VF values in the vectorized loop, each corresponding to an iteration of
1714   /// the original scalar loop.
1715   void collectLoopScalars(ElementCount VF);
1716 
1717   /// Keeps cost model vectorization decision and cost for instructions.
1718   /// Right now it is used for memory instructions only.
1719   using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1720                                 std::pair<InstWidening, InstructionCost>>;
1721 
1722   DecisionList WideningDecisions;
1723 
1724   using CallDecisionList =
1725       DenseMap<std::pair<CallInst *, ElementCount>, CallWideningDecision>;
1726 
1727   CallDecisionList CallWideningDecisions;
1728 
1729   /// Returns true if \p V is expected to be vectorized and it needs to be
1730   /// extracted.
1731   bool needsExtract(Value *V, ElementCount VF) const {
1732     Instruction *I = dyn_cast<Instruction>(V);
1733     if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1734         TheLoop->isLoopInvariant(I) ||
1735         getWideningDecision(I, VF) == CM_Scalarize)
1736       return false;
1737 
1738     // Assume we can vectorize V (and hence we need extraction) if the
1739     // scalars are not computed yet. This can happen, because it is called
1740     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1741     // the scalars are collected. That should be a safe assumption in most
1742     // cases, because we check if the operands have vectorizable types
1743     // beforehand in LoopVectorizationLegality.
1744     return !Scalars.contains(VF) || !isScalarAfterVectorization(I, VF);
1745   }
1746 
1747   /// Returns a range containing only operands needing to be extracted.
1748   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1749                                                    ElementCount VF) const {
1750     return SmallVector<Value *, 4>(make_filter_range(
1751         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1752   }
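
  // For example (illustrative): when estimating the scalarization overhead of
  // a call whose operands are a loop-invariant pointer and a widened add, only
  // the add is kept by this filter; the invariant operand needs no per-lane
  // extract.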
1753 
1754 public:
1755   /// The loop that we evaluate.
1756   Loop *TheLoop;
1757 
1758   /// Predicated scalar evolution analysis.
1759   PredicatedScalarEvolution &PSE;
1760 
1761   /// Loop Info analysis.
1762   LoopInfo *LI;
1763 
1764   /// Vectorization legality.
1765   LoopVectorizationLegality *Legal;
1766 
1767   /// Vector target information.
1768   const TargetTransformInfo &TTI;
1769 
1770   /// Target Library Info.
1771   const TargetLibraryInfo *TLI;
1772 
1773   /// Demanded bits analysis.
1774   DemandedBits *DB;
1775 
1776   /// Assumption cache.
1777   AssumptionCache *AC;
1778 
1779   /// Interface to emit optimization remarks.
1780   OptimizationRemarkEmitter *ORE;
1781 
1782   const Function *TheFunction;
1783 
1784   /// Loop Vectorize Hint.
1785   const LoopVectorizeHints *Hints;
1786 
1787   /// The interleave access information contains groups of interleaved accesses
1788   /// with the same stride and close to each other.
1789   InterleavedAccessInfo &InterleaveInfo;
1790 
1791   /// Values to ignore in the cost model.
1792   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1793 
1794   /// Values to ignore in the cost model when VF > 1.
1795   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1796 
1797   /// All element types found in the loop.
1798   SmallPtrSet<Type *, 16> ElementTypesInLoop;
1799 };
1800 } // end namespace llvm
1801 
1802 namespace {
1803 /// Helper struct to manage generating runtime checks for vectorization.
1804 ///
1805 /// The runtime checks are created up-front in temporary blocks, un-linked from
1806 /// the existing IR, to allow better estimating their cost. After deciding to
1807 /// vectorize, the checks are moved back. If deciding not to vectorize, the
1808 /// temporary blocks are completely removed.
1809 class GeneratedRTChecks {
1810   /// Basic block which contains the generated SCEV checks, if any.
1811   BasicBlock *SCEVCheckBlock = nullptr;
1812 
1813   /// The value representing the result of the generated SCEV checks. If it is
1814   /// nullptr, either no SCEV checks have been generated or they have been used.
1815   Value *SCEVCheckCond = nullptr;
1816 
1817   /// Basic block which contains the generated memory runtime checks, if any.
1818   BasicBlock *MemCheckBlock = nullptr;
1819 
1820   /// The value representing the result of the generated memory runtime checks.
1821   /// If it is nullptr, either no memory runtime checks have been generated or
1822   /// they have been used.
1823   Value *MemRuntimeCheckCond = nullptr;
1824 
1825   DominatorTree *DT;
1826   LoopInfo *LI;
1827   TargetTransformInfo *TTI;
1828 
1829   SCEVExpander SCEVExp;
1830   SCEVExpander MemCheckExp;
1831 
1832   bool CostTooHigh = false;
1833   const bool AddBranchWeights;
1834 
1835   Loop *OuterLoop = nullptr;
1836 
1837   PredicatedScalarEvolution &PSE;
1838 
1839 public:
1840   GeneratedRTChecks(PredicatedScalarEvolution &PSE, DominatorTree *DT,
1841                     LoopInfo *LI, TargetTransformInfo *TTI,
1842                     const DataLayout &DL, bool AddBranchWeights)
1843       : DT(DT), LI(LI), TTI(TTI), SCEVExp(*PSE.getSE(), DL, "scev.check"),
1844         MemCheckExp(*PSE.getSE(), DL, "scev.check"),
1845         AddBranchWeights(AddBranchWeights), PSE(PSE) {}
1846 
1847   /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1848   /// accurately estimate the cost of the runtime checks. The blocks are
1849   /// un-linked from the IR and are added back during vector code generation. If
1850   /// there is no vector code generation, the check blocks are removed
1851   /// completely.
1852   void create(Loop *L, const LoopAccessInfo &LAI,
1853               const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {
1854 
1855     // Hard cutoff to limit compile-time increase in case a very large number of
1856     // runtime checks needs to be generated.
1857     // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
1858     // profile info.
1859     CostTooHigh =
1860         LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold;
1861     if (CostTooHigh)
1862       return;
1863 
1864     BasicBlock *LoopHeader = L->getHeader();
1865     BasicBlock *Preheader = L->getLoopPreheader();
1866 
1867     // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1868     // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1869     // may be used by SCEVExpander. The blocks will be un-linked from their
1870     // predecessors and removed from LI & DT at the end of the function.
1871     if (!UnionPred.isAlwaysTrue()) {
1872       SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1873                                   nullptr, "vector.scevcheck");
1874 
1875       SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1876           &UnionPred, SCEVCheckBlock->getTerminator());
1877     }
1878 
1879     const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1880     if (RtPtrChecking.Need) {
1881       auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1882       MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
1883                                  "vector.memcheck");
1884 
1885       auto DiffChecks = RtPtrChecking.getDiffChecks();
1886       if (DiffChecks) {
1887         Value *RuntimeVF = nullptr;
1888         MemRuntimeCheckCond = addDiffRuntimeChecks(
1889             MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp,
1890             [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
1891               if (!RuntimeVF)
1892                 RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);
1893               return RuntimeVF;
1894             },
1895             IC);
1896       } else {
1897         MemRuntimeCheckCond = addRuntimeChecks(
1898             MemCheckBlock->getTerminator(), L, RtPtrChecking.getChecks(),
1899             MemCheckExp, VectorizerParams::HoistRuntimeChecks);
1900       }
1901       assert(MemRuntimeCheckCond &&
1902              "no RT checks generated although RtPtrChecking "
1903              "claimed checks are required");
1904     }
1905 
1906     if (!MemCheckBlock && !SCEVCheckBlock)
1907       return;
1908 
1909     // Unhook the temporary block with the checks, update various places
1910     // accordingly.
1911     if (SCEVCheckBlock)
1912       SCEVCheckBlock->replaceAllUsesWith(Preheader);
1913     if (MemCheckBlock)
1914       MemCheckBlock->replaceAllUsesWith(Preheader);
1915 
1916     if (SCEVCheckBlock) {
1917       SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1918       new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
1919       Preheader->getTerminator()->eraseFromParent();
1920     }
1921     if (MemCheckBlock) {
1922       MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1923       new UnreachableInst(Preheader->getContext(), MemCheckBlock);
1924       Preheader->getTerminator()->eraseFromParent();
1925     }
1926 
1927     DT->changeImmediateDominator(LoopHeader, Preheader);
1928     if (MemCheckBlock) {
1929       DT->eraseNode(MemCheckBlock);
1930       LI->removeBlock(MemCheckBlock);
1931     }
1932     if (SCEVCheckBlock) {
1933       DT->eraseNode(SCEVCheckBlock);
1934       LI->removeBlock(SCEVCheckBlock);
1935     }
1936 
1937     // Outer loop is used as part of the later cost calculations.
1938     OuterLoop = L->getParentLoop();
1939   }
1940 
1941   InstructionCost getCost() {
1942     if (SCEVCheckBlock || MemCheckBlock)
1943       LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");
1944 
1945     if (CostTooHigh) {
1946       InstructionCost Cost;
1947       Cost.setInvalid();
1948       LLVM_DEBUG(dbgs() << "  number of checks exceeded threshold\n");
1949       return Cost;
1950     }
1951 
1952     InstructionCost RTCheckCost = 0;
1953     if (SCEVCheckBlock)
1954       for (Instruction &I : *SCEVCheckBlock) {
1955         if (SCEVCheckBlock->getTerminator() == &I)
1956           continue;
1957         InstructionCost C =
1958             TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
1959         LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
1960         RTCheckCost += C;
1961       }
1962     if (MemCheckBlock) {
1963       InstructionCost MemCheckCost = 0;
1964       for (Instruction &I : *MemCheckBlock) {
1965         if (MemCheckBlock->getTerminator() == &I)
1966           continue;
1967         InstructionCost C =
1968             TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
1969         LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
1970         MemCheckCost += C;
1971       }
1972 
1973       // If the runtime memory checks are being created inside an outer loop
1974       // we should find out if these checks are outer loop invariant. If so,
1975       // the checks will likely be hoisted out and so the effective cost will
1976       // reduce according to the outer loop trip count.
1977       if (OuterLoop) {
1978         ScalarEvolution *SE = MemCheckExp.getSE();
1979         // TODO: If profitable, we could refine this further by analysing every
1980         // individual memory check, since there could be a mixture of loop
1981         // variant and invariant checks that mean the final condition is
1982         // variant.
1983         const SCEV *Cond = SE->getSCEV(MemRuntimeCheckCond);
1984         if (SE->isLoopInvariant(Cond, OuterLoop)) {
1985           // It seems reasonable to assume that we can reduce the effective
1986           // cost of the checks even when we know nothing about the trip
1987           // count. Assume that the outer loop executes at least twice.
1988           unsigned BestTripCount = 2;
1989 
1990           // Get the best known TC estimate.
1991           if (auto EstimatedTC = getSmallBestKnownTC(
1992                   PSE, OuterLoop, /* CanUseConstantMax = */ false))
1993             BestTripCount = *EstimatedTC;
1994 
1995           BestTripCount = std::max(BestTripCount, 1U);
1996           InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount;
1997 
1998           // Let's ensure the cost is always at least 1.
1999           NewMemCheckCost = std::max(*NewMemCheckCost.getValue(),
2000                                      (InstructionCost::CostType)1);
2001 
2002           if (BestTripCount > 1)
2003             LLVM_DEBUG(dbgs()
2004                        << "We expect runtime memory checks to be hoisted "
2005                        << "out of the outer loop. Cost reduced from "
2006                        << MemCheckCost << " to " << NewMemCheckCost << '\n');
2007 
2008           MemCheckCost = NewMemCheckCost;
2009         }
2010       }
2011 
2012       RTCheckCost += MemCheckCost;
2013     }
2014 
2015     if (SCEVCheckBlock || MemCheckBlock)
2016       LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
2017                         << "\n");
2018 
2019     return RTCheckCost;
2020   }
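
  // Worked example with hypothetical numbers: if MemCheckCost is 20 and the
  // best known outer-loop trip count is 4, the memory-check cost charged
  // against vectorization is reduced to 20 / 4 = 5, on the assumption that
  // the loop-invariant checks are hoisted out of the outer loop and amortized
  // over its iterations.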
2021 
2022   /// Remove the created SCEV & memory runtime check blocks & instructions, if
2023   /// unused.
2024   ~GeneratedRTChecks() {
2025     SCEVExpanderCleaner SCEVCleaner(SCEVExp);
2026     SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
2027     if (!SCEVCheckCond)
2028       SCEVCleaner.markResultUsed();
2029 
2030     if (!MemRuntimeCheckCond)
2031       MemCheckCleaner.markResultUsed();
2032 
2033     if (MemRuntimeCheckCond) {
2034       auto &SE = *MemCheckExp.getSE();
2035       // Memory runtime check generation creates compares that use expanded
2036       // values. Remove them before running the SCEVExpanderCleaners.
2037       for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
2038         if (MemCheckExp.isInsertedInstruction(&I))
2039           continue;
2040         SE.forgetValue(&I);
2041         I.eraseFromParent();
2042       }
2043     }
2044     MemCheckCleaner.cleanup();
2045     SCEVCleaner.cleanup();
2046 
2047     if (SCEVCheckCond)
2048       SCEVCheckBlock->eraseFromParent();
2049     if (MemRuntimeCheckCond)
2050       MemCheckBlock->eraseFromParent();
2051   }
2052 
2053   /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
2054   /// adjusts the branches to branch to the vector preheader or \p Bypass,
2055   /// depending on the generated condition.
2056   BasicBlock *emitSCEVChecks(BasicBlock *Bypass,
2057                              BasicBlock *LoopVectorPreHeader) {
2058     if (!SCEVCheckCond)
2059       return nullptr;
2060 
2061     Value *Cond = SCEVCheckCond;
2062     // Mark the check as used, to prevent it from being removed during cleanup.
2063     SCEVCheckCond = nullptr;
2064     if (auto *C = dyn_cast<ConstantInt>(Cond))
2065       if (C->isZero())
2066         return nullptr;
2067 
2068     auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2069 
2070     BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
2071     // Create new preheader for vector loop.
2072     if (OuterLoop)
2073       OuterLoop->addBasicBlockToLoop(SCEVCheckBlock, *LI);
2074 
2075     SCEVCheckBlock->getTerminator()->eraseFromParent();
2076     SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2077     Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2078                                                 SCEVCheckBlock);
2079 
2080     DT->addNewBlock(SCEVCheckBlock, Pred);
2081     DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
2082 
2083     BranchInst &BI = *BranchInst::Create(Bypass, LoopVectorPreHeader, Cond);
2084     if (AddBranchWeights)
2085       setBranchWeights(BI, SCEVCheckBypassWeights, /*IsExpected=*/false);
2086     ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), &BI);
2087     return SCEVCheckBlock;
2088   }
2089 
2090   /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2091   /// the branches to branch to the vector preheader or \p Bypass, depending on
2092   /// the generated condition.
2093   BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass,
2094                                    BasicBlock *LoopVectorPreHeader) {
2095     // Check if we generated code that checks at runtime whether arrays overlap.
2096     if (!MemRuntimeCheckCond)
2097       return nullptr;
2098 
2099     auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2100     Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2101                                                 MemCheckBlock);
2102 
2103     DT->addNewBlock(MemCheckBlock, Pred);
2104     DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
2105     MemCheckBlock->moveBefore(LoopVectorPreHeader);
2106 
2107     if (OuterLoop)
2108       OuterLoop->addBasicBlockToLoop(MemCheckBlock, *LI);
2109 
2110     BranchInst &BI =
2111         *BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond);
2112     if (AddBranchWeights) {
2113       setBranchWeights(BI, MemCheckBypassWeights, /*IsExpected=*/false);
2114     }
2115     ReplaceInstWithInst(MemCheckBlock->getTerminator(), &BI);
2116     MemCheckBlock->getTerminator()->setDebugLoc(
2117         Pred->getTerminator()->getDebugLoc());
2118 
2119     // Mark the check as used, to prevent it from being removed during cleanup.
2120     MemRuntimeCheckCond = nullptr;
2121     return MemCheckBlock;
2122   }
2123 };
2124 } // namespace
2125 
2126 static bool useActiveLaneMask(TailFoldingStyle Style) {
2127   return Style == TailFoldingStyle::Data ||
2128          Style == TailFoldingStyle::DataAndControlFlow ||
2129          Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2130 }
2131 
2132 static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style) {
2133   return Style == TailFoldingStyle::DataAndControlFlow ||
2134          Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2135 }
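
// For example: TailFoldingStyle::Data masks only the data accesses, so just
// useActiveLaneMask returns true for it, while DataAndControlFlow (with or
// without the runtime check) also drives the loop exit from the mask, so both
// helpers return true. DataWithEVL relies on an explicit vector length rather
// than a lane mask, so neither helper returns true for it.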
2136 
2137 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
2138 // vectorization. The loop needs to be annotated with #pragma omp simd
2139 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
2140 // vector length information is not provided, vectorization is not considered
2141 // explicit. Interleave hints are not allowed either. These limitations will be
2142 // relaxed in the future.
2143 // Please note that we are currently forced to abuse the pragma 'clang
2144 // vectorize' semantics. This pragma provides *auto-vectorization hints*
2145 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2146 // provides *explicit vectorization hints* (LV can bypass legal checks and
2147 // assume that vectorization is legal). However, both hints are implemented
2148 // using the same metadata (llvm.loop.vectorize, processed by
2149 // LoopVectorizeHints). This will be fixed in the future when the native IR
2150 // representation for pragma 'omp simd' is introduced.
2151 static bool isExplicitVecOuterLoop(Loop *OuterLp,
2152                                    OptimizationRemarkEmitter *ORE) {
2153   assert(!OuterLp->isInnermost() && "This is not an outer loop");
2154   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2155 
2156   // Only outer loops with an explicit vectorization hint are supported.
2157   // Unannotated outer loops are ignored.
2158   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
2159     return false;
2160 
2161   Function *Fn = OuterLp->getHeader()->getParent();
2162   if (!Hints.allowVectorization(Fn, OuterLp,
2163                                 true /*VectorizeOnlyWhenForced*/)) {
2164     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2165     return false;
2166   }
2167 
2168   if (Hints.getInterleave() > 1) {
2169     // TODO: Interleave support is future work.
2170     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2171                          "outer loops.\n");
2172     Hints.emitRemarkWithHints();
2173     return false;
2174   }
2175 
2176   return true;
2177 }
2178 
2179 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
2180                                   OptimizationRemarkEmitter *ORE,
2181                                   SmallVectorImpl<Loop *> &V) {
2182   // Collect inner loops and outer loops without irreducible control flow. For
2183   // now, only collect outer loops that have explicit vectorization hints. If we
2184   // are stress testing the VPlan H-CFG construction, we collect the outermost
2185   // loop of every loop nest.
2186   if (L.isInnermost() || VPlanBuildStressTest ||
2187       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
2188     LoopBlocksRPO RPOT(&L);
2189     RPOT.perform(LI);
2190     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2191       V.push_back(&L);
2192       // TODO: Collect inner loops inside marked outer loops in case
2193       // vectorization fails for the outer loop. Do not invoke
2194       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2195       // already known to be reducible. We can use an inherited attribute for
2196       // that.
2197       return;
2198     }
2199   }
2200   for (Loop *InnerL : L)
2201     collectSupportedLoops(*InnerL, LI, ORE, V);
2202 }
2203 
2204 //===----------------------------------------------------------------------===//
2205 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2206 // LoopVectorizationCostModel and LoopVectorizationPlanner.
2207 //===----------------------------------------------------------------------===//
2208 
2209 /// Compute the transformed value of Index at offset StartValue using step
2210 /// StepValue.
2211 /// For integer induction, returns StartValue + Index * StepValue.
2212 /// For pointer induction, returns StartValue[Index * StepValue].
2213 /// FIXME: The newly created binary instructions should contain nsw/nuw
2214 /// flags, which can be found from the original scalar operations.
2215 static Value *
2216 emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue,
2217                      Value *Step,
2218                      InductionDescriptor::InductionKind InductionKind,
2219                      const BinaryOperator *InductionBinOp) {
2220   Type *StepTy = Step->getType();
2221   Value *CastedIndex = StepTy->isIntegerTy()
2222                            ? B.CreateSExtOrTrunc(Index, StepTy)
2223                            : B.CreateCast(Instruction::SIToFP, Index, StepTy);
2224   if (CastedIndex != Index) {
2225     CastedIndex->setName(CastedIndex->getName() + ".cast");
2226     Index = CastedIndex;
2227   }
2228 
2229   // Note: the IR at this point is broken. We cannot use SE to create any new
2230   // SCEV and then expand it, hoping that SCEV's simplification will give us
2231   // more optimal code. Unfortunately, attempting to do so on invalid IR may
2232   // lead to various SCEV crashes. So all we can do is use the builder and rely
2233   // on InstCombine for future simplifications. Here we handle some trivial
2234   // cases only.
2235   auto CreateAdd = [&B](Value *X, Value *Y) {
2236     assert(X->getType() == Y->getType() && "Types don't match!");
2237     if (auto *CX = dyn_cast<ConstantInt>(X))
2238       if (CX->isZero())
2239         return Y;
2240     if (auto *CY = dyn_cast<ConstantInt>(Y))
2241       if (CY->isZero())
2242         return X;
2243     return B.CreateAdd(X, Y);
2244   };
2245 
2246   // We allow X to be a vector type, in which case Y will potentially be
2247   // splatted into a vector with the same element count.
2248   auto CreateMul = [&B](Value *X, Value *Y) {
2249     assert(X->getType()->getScalarType() == Y->getType() &&
2250            "Types don't match!");
2251     if (auto *CX = dyn_cast<ConstantInt>(X))
2252       if (CX->isOne())
2253         return Y;
2254     if (auto *CY = dyn_cast<ConstantInt>(Y))
2255       if (CY->isOne())
2256         return X;
2257     VectorType *XVTy = dyn_cast<VectorType>(X->getType());
2258     if (XVTy && !isa<VectorType>(Y->getType()))
2259       Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
2260     return B.CreateMul(X, Y);
2261   };
2262 
2263   switch (InductionKind) {
2264   case InductionDescriptor::IK_IntInduction: {
2265     assert(!isa<VectorType>(Index->getType()) &&
2266            "Vector indices not supported for integer inductions yet");
2267     assert(Index->getType() == StartValue->getType() &&
2268            "Index type does not match StartValue type");
2269     if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne())
2270       return B.CreateSub(StartValue, Index);
2271     auto *Offset = CreateMul(Index, Step);
2272     return CreateAdd(StartValue, Offset);
2273   }
2274   case InductionDescriptor::IK_PtrInduction:
2275     return B.CreatePtrAdd(StartValue, CreateMul(Index, Step));
2276   case InductionDescriptor::IK_FpInduction: {
2277     assert(!isa<VectorType>(Index->getType()) &&
2278            "Vector indices not supported for FP inductions yet");
2279     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2280     assert(InductionBinOp &&
2281            (InductionBinOp->getOpcode() == Instruction::FAdd ||
2282             InductionBinOp->getOpcode() == Instruction::FSub) &&
2283            "Original bin op should be defined for FP induction");
2284 
2285     Value *MulExp = B.CreateFMul(Step, Index);
2286     return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2287                          "induction");
2288   }
2289   case InductionDescriptor::IK_NoInduction:
2290     return nullptr;
2291   }
2292   llvm_unreachable("invalid enum");
2293 }
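
// Worked example with illustrative values: for an integer induction with
// StartValue = 10, Step = 3 and Index = 4, the emitted expression computes
// 10 + 4 * 3 = 22; with Step = -1 the CreateSub shortcut yields 10 - 4 = 6
// without materializing a multiply.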
2294 
2295 std::optional<unsigned> getMaxVScale(const Function &F,
2296                                      const TargetTransformInfo &TTI) {
2297   if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale())
2298     return MaxVScale;
2299 
2300   if (F.hasFnAttribute(Attribute::VScaleRange))
2301     return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
2302 
2303   return std::nullopt;
2304 }
2305 
2306 /// For the given VF and UF and maximum trip count computed for the loop, return
2307 /// whether the induction variable might overflow in the vectorized loop. If not,
2308 /// then we know a runtime overflow check always evaluates to false and can be
2309 /// removed.
2310 static bool isIndvarOverflowCheckKnownFalse(
2311     const LoopVectorizationCostModel *Cost,
2312     ElementCount VF, std::optional<unsigned> UF = std::nullopt) {
2313   // Always be conservative if we don't know the exact unroll factor.
2314   unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF);
2315 
2316   Type *IdxTy = Cost->Legal->getWidestInductionType();
2317   APInt MaxUIntTripCount = cast<IntegerType>(IdxTy)->getMask();
2318 
2319   // We know the runtime overflow check is known false iff the (max) trip-count
2320   // is known and (max) trip-count + (VF * UF) does not overflow in the type of
2321   // the vector loop induction variable.
2322   if (unsigned TC = Cost->PSE.getSmallConstantMaxTripCount()) {
2323     uint64_t MaxVF = VF.getKnownMinValue();
2324     if (VF.isScalable()) {
2325       std::optional<unsigned> MaxVScale =
2326           getMaxVScale(*Cost->TheFunction, Cost->TTI);
2327       if (!MaxVScale)
2328         return false;
2329       MaxVF *= *MaxVScale;
2330     }
2331 
2332     return (MaxUIntTripCount - TC).ugt(MaxVF * MaxUF);
2333   }
2334 
2335   return false;
2336 }
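
// Worked example with hypothetical numbers: for an i32 induction type, a known
// maximum trip count of 1000, VF = 4 and UF = 2, the check becomes
// (2^32 - 1) - 1000 > 4 * 2, which holds, so the runtime overflow check can be
// dropped as known-false.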
2337 
2338 // Return whether we allow using masked interleave-groups (for dealing with
2339 // strided loads/stores that reside in predicated blocks, or for dealing
2340 // with gaps).
2341 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2342   // If an override option has been passed in for interleaved accesses, use it.
2343   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2344     return EnableMaskedInterleavedMemAccesses;
2345 
2346   return TTI.enableMaskedInterleavedAccessVectorization();
2347 }
2348 
2349 void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr,
2350                                                VPReplicateRecipe *RepRecipe,
2351                                                const VPLane &Lane,
2352                                                VPTransformState &State) {
2353   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2354 
2355   // Does this instruction return a value?
2356   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2357 
2358   Instruction *Cloned = Instr->clone();
2359   if (!IsVoidRetTy) {
2360     Cloned->setName(Instr->getName() + ".cloned");
2361 #if !defined(NDEBUG)
2362     // Verify that VPlan type inference results agree with the type of the
2363     // generated values.
2364     assert(State.TypeAnalysis.inferScalarType(RepRecipe) == Cloned->getType() &&
2365            "inferred type and type from generated instructions do not match");
2366 #endif
2367   }
2368 
2369   RepRecipe->setFlags(Cloned);
2370 
2371   if (auto DL = Instr->getDebugLoc())
2372     State.setDebugLocFrom(DL);
2373 
2374   // Replace the operands of the cloned instructions with their scalar
2375   // equivalents in the new loop.
2376   for (const auto &I : enumerate(RepRecipe->operands())) {
2377     auto InputLane = Lane;
2378     VPValue *Operand = I.value();
2379     if (vputils::isUniformAfterVectorization(Operand))
2380       InputLane = VPLane::getFirstLane();
2381     Cloned->setOperand(I.index(), State.get(Operand, InputLane));
2382   }
2383   State.addNewMetadata(Cloned, Instr);
2384 
2385   // Place the cloned scalar in the new loop.
2386   State.Builder.Insert(Cloned);
2387 
2388   State.set(RepRecipe, Cloned, Lane);
2389 
2390   // If we just cloned a new assumption, add it to the assumption cache.
2391   if (auto *II = dyn_cast<AssumeInst>(Cloned))
2392     AC->registerAssumption(II);
2393 
2394   // End if-block.
2395   VPRegionBlock *Parent = RepRecipe->getParent()->getParent();
2396   bool IfPredicateInstr = Parent ? Parent->isReplicator() : false;
2397   assert(
2398       (Parent || !RepRecipe->getParent()->getPlan()->getVectorLoopRegion() ||
2399        all_of(RepRecipe->operands(),
2400               [](VPValue *Op) { return Op->isDefinedOutsideLoopRegions(); })) &&
2401       "Expected a recipe is either within a region or all of its operands "
2402       "are defined outside the vectorized region.");
2403   if (IfPredicateInstr)
2404     PredicatedInstructions.push_back(Cloned);
2405 }
2406 
2407 Value *
2408 InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
2409   if (VectorTripCount)
2410     return VectorTripCount;
2411 
2412   Value *TC = getTripCount();
2413   IRBuilder<> Builder(InsertBlock->getTerminator());
2414 
2415   Type *Ty = TC->getType();
2416   // This is where we can make the step a runtime constant.
2417   Value *Step = createStepForVF(Builder, Ty, VF, UF);
2418 
2419   // If the tail is to be folded by masking, round the number of iterations N
2420   // up to a multiple of Step instead of rounding down. This is done by first
2421   // adding Step-1 and then rounding down. Note that it's ok if this addition
2422   // overflows: the vector induction variable will eventually wrap to zero given
2423   // that it starts at zero and its Step is a power of two; the loop will then
2424   // exit, with the last early-exit vector comparison also producing all-true.
2425   // For scalable vectors the VF is not guaranteed to be a power of 2, but this
2426   // is accounted for in emitIterationCountCheck that adds an overflow check.
2427   if (Cost->foldTailByMasking()) {
2428     assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
2429            "VF*UF must be a power of 2 when folding tail by masking");
2430     TC = Builder.CreateAdd(TC, Builder.CreateSub(Step, ConstantInt::get(Ty, 1)),
2431                            "n.rnd.up");
2432   }
2433 
2434   // Now we need to generate the expression for the part of the loop that the
2435   // vectorized body will execute. This is equal to N - (N % Step) if scalar
2436   // iterations are not required for correctness, or N - Step, otherwise. Step
2437   // is equal to the vectorization factor (number of SIMD elements) times the
2438   // unroll factor (number of SIMD instructions).
2439   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2440 
2441   // There are cases where we *must* run at least one iteration in the remainder
2442   // loop.  See the cost model for when this can happen.  If the step evenly
2443   // divides the trip count, we set the remainder to be equal to the step. If
2444   // the step does not evenly divide the trip count, no adjustment is necessary
2445   // since there will already be scalar iterations. Note that the minimum
2446   // iterations check ensures that N >= Step.
2447   if (Cost->requiresScalarEpilogue(VF.isVector())) {
2448     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2449     R = Builder.CreateSelect(IsZero, Step, R);
2450   }
2451 
2452   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
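       // For example, with VF * UF = 4 and a trip count of 10: R = 10 % 4 = 2
       // and n.vec = 8, leaving 2 iterations for the scalar loop. If a scalar
       // epilogue is required and the trip count is 12, R is bumped from 0 to
       // 4, so n.vec = 8 and the last 4 iterations run in the scalar loop.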
2453 
2454   return VectorTripCount;
2455 }
2456 
2457 void InnerLoopVectorizer::introduceCheckBlockInVPlan(BasicBlock *CheckIRBB) {
2458   VPBlockBase *ScalarPH = Plan.getScalarPreheader();
2459   VPBlockBase *PreVectorPH = VectorPHVPB->getSinglePredecessor();
2460   if (PreVectorPH->getNumSuccessors() != 1) {
2461     assert(PreVectorPH->getNumSuccessors() == 2 && "Expected 2 successors");
2462     assert(PreVectorPH->getSuccessors()[0] == ScalarPH &&
2463            "Unexpected successor");
2464     VPIRBasicBlock *CheckVPIRBB = Plan.createVPIRBasicBlock(CheckIRBB);
2465     VPBlockUtils::insertOnEdge(PreVectorPH, VectorPHVPB, CheckVPIRBB);
2466     PreVectorPH = CheckVPIRBB;
2467   }
2468   VPBlockUtils::connectBlocks(PreVectorPH, ScalarPH);
2469   PreVectorPH->swapSuccessors();
2470 }
2471 
2472 void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
2473   Value *Count = getTripCount();
2474   // Reuse existing vector loop preheader for TC checks.
2475   // Note that a new preheader block is generated for the vector loop.
2476   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2477   IRBuilder<> Builder(TCCheckBlock->getTerminator());
2478 
2479   // Generate code to check if the loop's trip count is less than VF * UF, or
2480   // equal to it in case a scalar epilogue is required; this implies that the
2481   // vector trip count is zero. This check also covers the case where adding one
2482   // to the backedge-taken count overflowed leading to an incorrect trip count
2483   // of zero. In this case we will also jump to the scalar loop.
2484   auto P = Cost->requiresScalarEpilogue(VF.isVector()) ? ICmpInst::ICMP_ULE
2485                                                        : ICmpInst::ICMP_ULT;
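       // For example, with VF * UF = 4: ULT bypasses to the scalar loop only
       // when the trip count is less than 4, while ULE also bypasses when it
       // is exactly 4, since at least one iteration must be left over for the
       // required scalar epilogue.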
2486 
2487   // If tail is to be folded, vector loop takes care of all iterations.
2488   Type *CountTy = Count->getType();
2489   Value *CheckMinIters = Builder.getFalse();
2490   auto CreateStep = [&]() -> Value * {
2491     // Create step with max(MinProfitableTripCount, UF * VF).
2492     if (UF * VF.getKnownMinValue() >= MinProfitableTripCount.getKnownMinValue())
2493       return createStepForVF(Builder, CountTy, VF, UF);
2494 
2495     Value *MinProfTC =
2496         createStepForVF(Builder, CountTy, MinProfitableTripCount, 1);
2497     if (!VF.isScalable())
2498       return MinProfTC;
2499     return Builder.CreateBinaryIntrinsic(
2500         Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF));
2501   };
2502 
2503   TailFoldingStyle Style = Cost->getTailFoldingStyle();
2504   if (Style == TailFoldingStyle::None) {
2505     Value *Step = CreateStep();
2506     ScalarEvolution &SE = *PSE.getSE();
2507     // TODO: Emit unconditional branch to vector preheader instead of
2508     // conditional branch with known condition.
2509     const SCEV *TripCountSCEV = SE.applyLoopGuards(SE.getSCEV(Count), OrigLoop);
2510     // Check if the trip count is < the step.
2511     if (SE.isKnownPredicate(P, TripCountSCEV, SE.getSCEV(Step))) {
2512       // TODO: Ensure step is at most the trip count when determining max VF and
2513       // UF, w/o tail folding.
2514       CheckMinIters = Builder.getTrue();
2515     } else if (!SE.isKnownPredicate(CmpInst::getInversePredicate(P),
2516                                     TripCountSCEV, SE.getSCEV(Step))) {
2517       // Generate the minimum iteration check only if we cannot prove the
2518       // check is known to be true, or known to be false.
2519       CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
2520     } // else step known to not exceed trip count; CheckMinIters stays false.
2521   } else if (VF.isScalable() &&
2522              !isIndvarOverflowCheckKnownFalse(Cost, VF, UF) &&
2523              Style != TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) {
2524     // vscale is not necessarily a power-of-2, which means we cannot guarantee
2525     // an overflow to zero when updating induction variables and so an
2526     // additional overflow check is required before entering the vector loop.
2527 
2528     // Get the maximum unsigned value for the type.
2529     Value *MaxUIntTripCount =
2530         ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask());
2531     Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count);
2532 
2533     // Don't execute the vector loop if (UMax - n) < (VF * UF).
2534     CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep());
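         // As a concrete illustration: with an i8 trip count, n = 250 and a
         // runtime step (VF * UF) of 8, UMax - n = 5 < 8, so rounding n up to
         // a multiple of the step would wrap and the scalar loop is taken.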
2535   }
2536 
2537   // Create new preheader for vector loop.
2538   LoopVectorPreHeader =
2539       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
2540                  "vector.ph");
2541 
2542   assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
2543                                DT->getNode(Bypass)->getIDom()) &&
2544          "TC check is expected to dominate Bypass");
2545 
2546   BranchInst &BI =
2547       *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
2548   if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
2549     setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
2550   ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
2551   LoopBypassBlocks.push_back(TCCheckBlock);
2552 
2553   // TODO: Wrap LoopVectorPreHeader in VPIRBasicBlock here.
2554   introduceCheckBlockInVPlan(TCCheckBlock);
2555 }
2556 
2557 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {
2558   BasicBlock *const SCEVCheckBlock =
2559       RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader);
2560   if (!SCEVCheckBlock)
2561     return nullptr;
2562 
2563   assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
2564            (OptForSizeBasedOnProfile &&
2565             Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
2566          "Cannot SCEV check stride or overflow when optimizing for size");
2567   assert(!LoopBypassBlocks.empty() &&
2568          "Should already be a bypass block due to iteration count check");
2569   LoopBypassBlocks.push_back(SCEVCheckBlock);
2570   AddedSafetyChecks = true;
2571 
2572   introduceCheckBlockInVPlan(SCEVCheckBlock);
2573   return SCEVCheckBlock;
2574 }
2575 
2576 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) {
2577   // VPlan-native path does not do any analysis for runtime checks currently.
2578   if (EnableVPlanNativePath)
2579     return nullptr;
2580 
2581   BasicBlock *const MemCheckBlock =
2582       RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader);
2583 
2584   // Check if we generated code that checks at runtime whether arrays overlap.
2585   // We put the checks into a separate block to make the more common case of
2586   // few elements faster.
2587   if (!MemCheckBlock)
2588     return nullptr;
2589 
2590   if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
2591     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
2592            "Cannot emit memory checks when optimizing for size, unless forced "
2593            "to vectorize.");
2594     ORE->emit([&]() {
2595       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
2596                                         OrigLoop->getStartLoc(),
2597                                         OrigLoop->getHeader())
2598              << "Code-size may be reduced by not forcing "
2599                 "vectorization, or by source-code modifications "
2600                 "eliminating the need for runtime checks "
2601                 "(e.g., adding 'restrict').";
2602     });
2603   }
2604 
2605   LoopBypassBlocks.push_back(MemCheckBlock);
2606 
2607   AddedSafetyChecks = true;
2608 
2609   introduceCheckBlockInVPlan(MemCheckBlock);
2610   return MemCheckBlock;
2611 }
2612 
2613 /// Replace \p VPBB with a VPIRBasicBlock wrapping \p IRBB. All recipes from \p
2614 /// VPBB are moved to the end of the newly created VPIRBasicBlock. VPBB must
2615 /// have a single predecessor, which is rewired to the new VPIRBasicBlock. All
2616 /// successors of VPBB, if any, are rewired to the new VPIRBasicBlock.
2617 static void replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB) {
2618   VPIRBasicBlock *IRVPBB = VPBB->getPlan()->createVPIRBasicBlock(IRBB);
2619   for (auto &R : make_early_inc_range(*VPBB)) {
2620     assert(!R.isPhi() && "Tried to move phi recipe to end of block");
2621     R.moveBefore(*IRVPBB, IRVPBB->end());
2622   }
2623 
2624   VPBlockUtils::reassociateBlocks(VPBB, IRVPBB);
2625   // VPBB is now dead and will be cleaned up when the plan gets destroyed.
2626 }
2627 
2628 void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
2629   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
2630   assert(LoopVectorPreHeader && "Invalid loop structure");
2631   assert((OrigLoop->getUniqueLatchExitBlock() ||
2632           Cost->requiresScalarEpilogue(VF.isVector())) &&
2633          "loops not exiting via the latch without required epilogue?");
2634 
2635   LoopMiddleBlock =
2636       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
2637                  LI, nullptr, Twine(Prefix) + "middle.block");
2638   replaceVPBBWithIRVPBB(Plan.getMiddleBlock(), LoopMiddleBlock);
2639   LoopScalarPreHeader =
2640       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
2641                  nullptr, Twine(Prefix) + "scalar.ph");
2642   replaceVPBBWithIRVPBB(Plan.getScalarPreheader(), LoopScalarPreHeader);
2643 }
2644 
2645 /// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
2646 /// expansion results.
2647 static Value *getExpandedStep(const InductionDescriptor &ID,
2648                               const SCEV2ValueTy &ExpandedSCEVs) {
2649   const SCEV *Step = ID.getStep();
2650   if (auto *C = dyn_cast<SCEVConstant>(Step))
2651     return C->getValue();
2652   if (auto *U = dyn_cast<SCEVUnknown>(Step))
2653     return U->getValue();
2654   auto I = ExpandedSCEVs.find(Step);
2655   assert(I != ExpandedSCEVs.end() && "SCEV must be expanded at this point");
2656   return I->second;
2657 }
2658 
2659 /// Knowing that loop \p L executes a single vector iteration, add instructions
2660 /// that will get simplified and thus should not have any cost to \p
2661 /// InstsToIgnore.
2662 static void addFullyUnrolledInstructionsToIgnore(
2663     Loop *L, const LoopVectorizationLegality::InductionList &IL,
2664     SmallPtrSetImpl<Instruction *> &InstsToIgnore) {
2665   auto *Cmp = L->getLatchCmpInst();
2666   if (Cmp)
2667     InstsToIgnore.insert(Cmp);
2668   for (const auto &KV : IL) {
2669     // Extract the key by hand so that it can be used in the lambda below.  Note
2670     // that captured structured bindings are a C++20 extension.
2671     const PHINode *IV = KV.first;
2672 
2673     // Get next iteration value of the induction variable.
2674     Instruction *IVInst =
2675         cast<Instruction>(IV->getIncomingValueForBlock(L->getLoopLatch()));
2676     if (all_of(IVInst->users(),
2677                [&](const User *U) { return U == IV || U == Cmp; }))
2678       InstsToIgnore.insert(IVInst);
2679   }
2680 }
2681 
2682 void InnerLoopVectorizer::createInductionAdditionalBypassValues(
2683     const SCEV2ValueTy &ExpandedSCEVs, Value *MainVectorTripCount) {
2684   assert(MainVectorTripCount && "Must have bypass information");
2685 
2686   Instruction *OldInduction = Legal->getPrimaryInduction();
2687   IRBuilder<> BypassBuilder(getAdditionalBypassBlock(),
2688                             getAdditionalBypassBlock()->getFirstInsertionPt());
2689   for (const auto &InductionEntry : Legal->getInductionVars()) {
2690     PHINode *OrigPhi = InductionEntry.first;
2691     const InductionDescriptor &II = InductionEntry.second;
2692     Value *Step = getExpandedStep(II, ExpandedSCEVs);
2693     // For the primary induction the additional bypass end value is known.
2694     // Otherwise it is computed.
2695     Value *EndValueFromAdditionalBypass = MainVectorTripCount;
2696     if (OrigPhi != OldInduction) {
2697       auto *BinOp = II.getInductionBinOp();
2698       // Fast-math-flags propagate from the original induction instruction.
2699       if (isa_and_nonnull<FPMathOperator>(BinOp))
2700         BypassBuilder.setFastMathFlags(BinOp->getFastMathFlags());
2701 
2702       // Compute the end value for the additional bypass.
2703       EndValueFromAdditionalBypass =
2704           emitTransformedIndex(BypassBuilder, MainVectorTripCount,
2705                                II.getStartValue(), Step, II.getKind(), BinOp);
2706       EndValueFromAdditionalBypass->setName("ind.end");
2707     }
2708 
2709     // Store the bypass value here, as it needs to be added as operand to its
2710     // scalar preheader phi node after the epilogue skeleton has been created.
2711     // TODO: Directly add as extra operand to the VPResumePHI recipe.
2712     assert(!Induction2AdditionalBypassValue.contains(OrigPhi) &&
2713            "entry for OrigPhi already exists");
2714     Induction2AdditionalBypassValue[OrigPhi] = EndValueFromAdditionalBypass;
2715   }
2716 }
2717 
2718 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton(
2719     const SCEV2ValueTy &ExpandedSCEVs) {
2720   /*
2721    In this function we generate a new loop. The new loop will contain
2722    the vectorized instructions while the old loop will continue to run the
2723    scalar remainder.
2724 
2725        [ ] <-- old preheader - loop iteration number check and SCEVs in Plan's
2726      /  |      preheader are expanded here. Eventually all required SCEV
2727     /   |      expansion should happen here.
2728    /    v
2729   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
2730   |  /  |
2731   | /   v
2732   ||   [ ]     <-- vector pre header.
2733   |/    |
2734   |     v
2735   |    [  ] \
2736   |    [  ]_|   <-- vector loop (created during VPlan execution).
2737   |     |
2738   |     v
2739   \   -[ ]   <--- middle-block (wrapped in VPIRBasicBlock with the branch to
2740    |    |                       successors created during VPlan execution)
2741    \/   |
2742    /\   v
2743    | ->[ ]     <--- new preheader (wrapped in VPIRBasicBlock).
2744    |    |
2745  (opt)  v      <-- edge from middle to exit iff epilogue is not required.
2746    |   [ ] \
2747    |   [ ]_|   <-- old scalar loop to handle remainder (scalar epilogue, header
2748    |    |          wrapped in VPIRBasicBlock).
2749     \   |
2750      \  v
2751       >[ ]     <-- exit block(s). (wrapped in VPIRBasicBlock)
2752    ...
2753    */
2754 
2755   // Create an empty vector loop, and prepare basic blocks for the runtime
2756   // checks.
2757   createVectorLoopSkeleton("");
2758 
2759   // Now, compare the new count to zero. If it is zero skip the vector loop and
2760   // jump to the scalar loop. This check also covers the case where the
2761   // backedge-taken count is uint##_max: adding one to it will overflow leading
2762   // to an incorrect trip count of zero. In this (rare) case we will also jump
2763   // to the scalar loop.
2764   emitIterationCountCheck(LoopScalarPreHeader);
2765 
2766   // Generate the code to check any assumptions that we've made for SCEV
2767   // expressions.
2768   emitSCEVChecks(LoopScalarPreHeader);
2769 
2770   // Generate the code that checks at runtime whether arrays overlap. We put
2771   // the checks into a separate block to make the more common case of few
2772   // elements faster.
2773   emitMemRuntimeChecks(LoopScalarPreHeader);
2774 
2775   return LoopVectorPreHeader;
2776 }
2777 
2778 // Fix up external users of the induction variable. At this point, we are
2779 // in LCSSA form, with all external PHIs that use the IV having one input value,
2780 // coming from the remainder loop. We need those PHIs to also have a correct
2781 // value for the IV when arriving directly from the middle block.
2782 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
2783                                        const InductionDescriptor &II,
2784                                        Value *VectorTripCount,
2785                                        BasicBlock *MiddleBlock,
2786                                        VPTransformState &State) {
2787   // There are two kinds of external IV usages - those that use the value
2788   // computed in the last iteration (the PHI) and those that use the penultimate
2789   // value (the value that feeds into the phi from the loop latch).
2790   // We allow both, but they, obviously, have different values.
2791 
2792   DenseMap<Value *, Value *> MissingVals;
2793 
2794   Value *EndValue = cast<PHINode>(OrigPhi->getIncomingValueForBlock(
2795                                       OrigLoop->getLoopPreheader()))
2796                         ->getIncomingValueForBlock(MiddleBlock);
2797 
2798   // An external user of the last iteration's value should see the value that
2799   // the remainder loop uses to initialize its own IV.
2800   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
2801   for (User *U : PostInc->users()) {
2802     Instruction *UI = cast<Instruction>(U);
2803     if (!OrigLoop->contains(UI)) {
2804       assert(isa<PHINode>(UI) && "Expected LCSSA form");
2805       MissingVals[UI] = EndValue;
2806     }
2807   }
2808 
2809   // An external user of the penultimate value needs to see EndValue - Step.
2810   // The simplest way to get this is to recompute it from the constituent SCEVs,
2811   // that is Start + (Step * (CRD - 1)).
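       // For example, for an integer IV starting at 0 with step 1 and a
       // vector trip count of 8: users of the post-increment value see 8 (the
       // value the scalar remainder resumes from), while users of the phi
       // itself see 8 - 1 = 7, the value the IV held in the last vectorized
       // iteration.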
2812   for (User *U : OrigPhi->users()) {
2813     auto *UI = cast<Instruction>(U);
2814     if (!OrigLoop->contains(UI)) {
2815       assert(isa<PHINode>(UI) && "Expected LCSSA form");
2816       IRBuilder<> B(MiddleBlock->getTerminator());
2817 
2818       // Fast-math-flags propagate from the original induction instruction.
2819       if (isa_and_nonnull<FPMathOperator>(II.getInductionBinOp()))
2820         B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
2821 
2822       VPValue *StepVPV = Plan.getSCEVExpansion(II.getStep());
2823       assert(StepVPV && "step must have been expanded during VPlan execution");
2824       Value *Step = StepVPV->isLiveIn() ? StepVPV->getLiveInIRValue()
2825                                         : State.get(StepVPV, VPLane(0));
2826       Value *Escape = nullptr;
2827       if (EndValue->getType()->isIntegerTy())
2828         Escape = B.CreateSub(EndValue, Step);
2829       else if (EndValue->getType()->isPointerTy())
2830         Escape = B.CreatePtrAdd(EndValue, B.CreateNeg(Step));
2831       else {
2832         assert(EndValue->getType()->isFloatingPointTy() &&
2833                "Unexpected induction type");
2834         Escape = B.CreateBinOp(II.getInductionBinOp()->getOpcode() ==
2835                                        Instruction::FAdd
2836                                    ? Instruction::FSub
2837                                    : Instruction::FAdd,
2838                                EndValue, Step);
2839       }
2840       Escape->setName("ind.escape");
2841       MissingVals[UI] = Escape;
2842     }
2843   }
2844 
2845   assert((MissingVals.empty() ||
2846           all_of(MissingVals,
2847                  [MiddleBlock, this](const std::pair<Value *, Value *> &P) {
2848                    return all_of(
2849                        predecessors(cast<Instruction>(P.first)->getParent()),
2850                        [MiddleBlock, this](BasicBlock *Pred) {
2851                          return Pred == MiddleBlock ||
2852                                 Pred == OrigLoop->getLoopLatch();
2853                        });
2854                  })) &&
2855          "Expected escaping values from latch/middle.block only");
2856 
2857   for (auto &I : MissingVals) {
2858     PHINode *PHI = cast<PHINode>(I.first);
2859     // One corner case we have to handle is two IVs "chasing" each other,
2860     // that is %IV2 = phi [...], [ %IV1, %latch ]
2861     // In this case, if IV1 has an external use, we need to avoid adding both
2862     // "last value of IV1" and "penultimate value of IV2". So, verify that we
2863     // don't already have an incoming value for the middle block.
2864     if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
2865       PHI->addIncoming(I.second, MiddleBlock);
2866   }
2867 }
2868 
2869 namespace {
2870 
2871 struct CSEDenseMapInfo {
2872   static bool canHandle(const Instruction *I) {
2873     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
2874            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
2875   }
2876 
2877   static inline Instruction *getEmptyKey() {
2878     return DenseMapInfo<Instruction *>::getEmptyKey();
2879   }
2880 
2881   static inline Instruction *getTombstoneKey() {
2882     return DenseMapInfo<Instruction *>::getTombstoneKey();
2883   }
2884 
2885   static unsigned getHashValue(const Instruction *I) {
2886     assert(canHandle(I) && "Unknown instruction!");
2887     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
2888                                                            I->value_op_end()));
2889   }
2890 
2891   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
2892     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
2893         LHS == getTombstoneKey() || RHS == getTombstoneKey())
2894       return LHS == RHS;
2895     return LHS->isIdenticalTo(RHS);
2896   }
2897 };
2898 
2899 } // end anonymous namespace
2900 
2901 /// Perform CSE of induction variable instructions.
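     /// Widened code frequently emits identical getelementptr, insertelement,
     /// extractelement or shufflevector instructions into the vector loop
     /// header; the routine below keeps the first occurrence and replaces
     /// later duplicates with it.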
2902 static void cse(BasicBlock *BB) {
2903   // Perform simple cse.
2904   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
2905   for (Instruction &In : llvm::make_early_inc_range(*BB)) {
2906     if (!CSEDenseMapInfo::canHandle(&In))
2907       continue;
2908 
2909     // Check if we can replace this instruction with any of the
2910     // visited instructions.
2911     if (Instruction *V = CSEMap.lookup(&In)) {
2912       In.replaceAllUsesWith(V);
2913       In.eraseFromParent();
2914       continue;
2915     }
2916 
2917     CSEMap[&In] = &In;
2918   }
2919 }
2920 
2921 InstructionCost
2922 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
2923                                               ElementCount VF) const {
2924   // We only need to calculate a cost if the VF is scalar; for actual vectors
2925   // we should already have a pre-calculated cost at each VF.
2926   if (!VF.isScalar())
2927     return CallWideningDecisions.at(std::make_pair(CI, VF)).Cost;
2928 
2929   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
2930   Type *RetTy = CI->getType();
2931   if (RecurrenceDescriptor::isFMulAddIntrinsic(CI))
2932     if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind))
2933       return *RedCost;
2934 
2935   SmallVector<Type *, 4> Tys;
2936   for (auto &ArgOp : CI->args())
2937     Tys.push_back(ArgOp->getType());
2938 
2939   InstructionCost ScalarCallCost =
2940       TTI.getCallInstrCost(CI->getCalledFunction(), RetTy, Tys, CostKind);
2941 
2942   // If this is an intrinsic we may have a lower cost for it.
2943   if (getVectorIntrinsicIDForCall(CI, TLI)) {
2944     InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
2945     return std::min(ScalarCallCost, IntrinsicCost);
2946   }
2947   return ScalarCallCost;
2948 }
2949 
2950 static Type *maybeVectorizeType(Type *Elt, ElementCount VF) {
2951   if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
2952     return Elt;
2953   return VectorType::get(Elt, VF);
2954 }
2955 
2956 InstructionCost
2957 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
2958                                                    ElementCount VF) const {
2959   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
2960   assert(ID && "Expected intrinsic call!");
2961   Type *RetTy = maybeVectorizeType(CI->getType(), VF);
2962   FastMathFlags FMF;
2963   if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
2964     FMF = FPMO->getFastMathFlags();
2965 
2966   SmallVector<const Value *> Arguments(CI->args());
2967   FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
2968   SmallVector<Type *> ParamTys;
2969   std::transform(FTy->param_begin(), FTy->param_end(),
2970                  std::back_inserter(ParamTys),
2971                  [&](Type *Ty) { return maybeVectorizeType(Ty, VF); });
2972 
2973   IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
2974                                     dyn_cast<IntrinsicInst>(CI));
2975   return TTI.getIntrinsicInstrCost(CostAttrs,
2976                                    TargetTransformInfo::TCK_RecipThroughput);
2977 }
2978 
2979 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
2980   // Fix widened non-induction PHIs by setting up the PHI operands.
2981   if (EnableVPlanNativePath)
2982     fixNonInductionPHIs(State);
2983 
2984   // Forget the original basic block.
2985   PSE.getSE()->forgetLoop(OrigLoop);
2986   PSE.getSE()->forgetBlockAndLoopDispositions();
2987 
2988   // After vectorization, the exit blocks of the original loop will have
2989   // additional predecessors. Invalidate SCEVs for the exit phis in case SE
2990   // looked through single-entry phis.
2991   SmallVector<BasicBlock *> ExitBlocks;
2992   OrigLoop->getExitBlocks(ExitBlocks);
2993   for (BasicBlock *Exit : ExitBlocks)
2994     for (PHINode &PN : Exit->phis())
2995       PSE.getSE()->forgetLcssaPhiWithNewPredecessor(OrigLoop, &PN);
2996 
2997   if (Cost->requiresScalarEpilogue(VF.isVector())) {
2998     // No edge from the middle block to the unique exit block has been inserted
2999     // and there is nothing to fix from vector loop; phis should have incoming
3000     // from scalar loop only.
3001   } else {
3002     // TODO: Check in VPlan to see if IV users need fixing instead of checking
3003     // the cost model.
3004 
3005     // If we inserted an edge from the middle block to the unique exit block,
3006     // update uses outside the loop (phis) to account for the newly inserted
3007     // edge.
3008 
3009     // Fix-up external users of the induction variables.
3010     for (const auto &Entry : Legal->getInductionVars())
3011       fixupIVUsers(Entry.first, Entry.second,
3012                    getOrCreateVectorTripCount(nullptr), LoopMiddleBlock, State);
3013   }
3014 
3015   // Don't apply optimizations below when no vector region remains, as they all
3016   // require a vector loop at the moment.
3017   if (!State.Plan->getVectorLoopRegion())
3018     return;
3019 
3020   for (Instruction *PI : PredicatedInstructions)
3021     sinkScalarOperands(&*PI);
3022 
3023   VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion();
3024   VPBasicBlock *HeaderVPBB = VectorRegion->getEntryBasicBlock();
3025   BasicBlock *HeaderBB = State.CFG.VPBB2IRBB[HeaderVPBB];
3026 
3027   // Remove redundant induction instructions.
3028   cse(HeaderBB);
3029 
3030   // Set/update profile weights for the vector and remainder loops as original
3031   // loop iterations are now distributed among them. Note that original loop
3032   // becomes the scalar remainder loop after vectorization.
3033   //
3034   // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
3035   // end up with a slightly roughened result, but that should be OK since
3036   // profile is not inherently precise anyway. Note also possible bypass of
3037   // vector code caused by legality checks is ignored, assigning all the weight
3038   // to the vector loop, optimistically.
3039   //
3040   // For scalable vectorization we can't know at compile time how many
3041   // iterations of the loop are handled in one vector iteration, so instead
3042   // assume a pessimistic vscale of '1'.
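       //
       // For example (a rough illustration), if profile data implies about
       // 1000 iterations of the original loop and VF * UF = 8, roughly
       // 1000 / 8 = 125 iterations are attributed to the vector loop, with
       // the small remainder attributed to the scalar loop.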
3043   Loop *VectorLoop = LI->getLoopFor(HeaderBB);
3044   setProfileInfoAfterUnrolling(OrigLoop, VectorLoop, OrigLoop,
3045                                VF.getKnownMinValue() * UF);
3046 }
3047 
3048 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
3049   // The basic block and loop containing the predicated instruction.
3050   auto *PredBB = PredInst->getParent();
3051   auto *VectorLoop = LI->getLoopFor(PredBB);
3052 
3053   // Initialize a worklist with the operands of the predicated instruction.
3054   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3055 
3056   // Holds instructions that we need to analyze again. An instruction may be
3057   // reanalyzed if we don't yet know if we can sink it or not.
3058   SmallVector<Instruction *, 8> InstsToReanalyze;
3059 
3060   // Returns true if a given use occurs in the predicated block. Phi nodes use
3061   // their operands in their corresponding predecessor blocks.
3062   auto IsBlockOfUsePredicated = [&](Use &U) -> bool {
3063     auto *I = cast<Instruction>(U.getUser());
3064     BasicBlock *BB = I->getParent();
3065     if (auto *Phi = dyn_cast<PHINode>(I))
3066       BB = Phi->getIncomingBlock(
3067           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3068     return BB == PredBB;
3069   };
3070 
3071   // Iteratively sink the scalarized operands of the predicated instruction
3072   // into the block we created for it. When an instruction is sunk, its
3073   // operands are then added to the worklist. The algorithm ends when a pass
3074   // through the worklist fails to sink any instruction.
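       // For example, if the predicated block contains a scalarized store
       // whose address %gep (a purely illustrative name) is computed outside
       // the block and used only by that store, %gep is moved into the block;
       // its own operands are then reconsidered on the next pass.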
3075   bool Changed;
3076   do {
3077     // Add the instructions that need to be reanalyzed to the worklist, and
3078     // reset the changed indicator.
3079     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
3080     InstsToReanalyze.clear();
3081     Changed = false;
3082 
3083     while (!Worklist.empty()) {
3084       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
3085 
3086       // We can't sink an instruction if it is a phi node, is not in the loop,
3087       // may have side effects or may read from memory.
3088       // TODO: Could do more granular checking to allow sinking
3089       // a load past non-store instructions.
3090       if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
3091           I->mayHaveSideEffects() || I->mayReadFromMemory())
3092         continue;
3093 
3094       // If the instruction is already in PredBB, check if we can sink its
3095       // operands. In that case, VPlan's sinkScalarOperands() succeeded in
3096       // sinking the scalar instruction I, hence it appears in PredBB; but it
3097       // may have failed to sink I's operands (recursively), which we try
3098       // (again) here.
3099       if (I->getParent() == PredBB) {
3100         Worklist.insert(I->op_begin(), I->op_end());
3101         continue;
3102       }
3103 
3104       // It's legal to sink the instruction if all its uses occur in the
3105       // predicated block. Otherwise, there's nothing to do yet, and we may
3106       // need to reanalyze the instruction.
3107       if (!llvm::all_of(I->uses(), IsBlockOfUsePredicated)) {
3108         InstsToReanalyze.push_back(I);
3109         continue;
3110       }
3111 
3112       // Move the instruction to the beginning of the predicated block, and add
3113       // its operands to the worklist.
3114       I->moveBefore(&*PredBB->getFirstInsertionPt());
3115       Worklist.insert(I->op_begin(), I->op_end());
3116 
3117       // The sinking may have enabled other instructions to be sunk, so we will
3118       // need to iterate.
3119       Changed = true;
3120     }
3121   } while (Changed);
3122 }
3123 
3124 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) {
3125   auto Iter = vp_depth_first_deep(Plan.getEntry());
3126   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
3127     for (VPRecipeBase &P : VPBB->phis()) {
3128       VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P);
3129       if (!VPPhi)
3130         continue;
3131       PHINode *NewPhi = cast<PHINode>(State.get(VPPhi));
3132       // Make sure the builder has a valid insert point.
3133       Builder.SetInsertPoint(NewPhi);
3134       for (unsigned Idx = 0; Idx < VPPhi->getNumOperands(); ++Idx) {
3135         VPValue *Inc = VPPhi->getIncomingValue(Idx);
3136         VPBasicBlock *VPBB = VPPhi->getIncomingBlock(Idx);
3137         NewPhi->addIncoming(State.get(Inc), State.CFG.VPBB2IRBB[VPBB]);
3138       }
3139     }
3140   }
3141 }
3142 
3143 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
3144   // We should not collect Scalars more than once per VF. Right now, this
3145   // function is called from collectUniformsAndScalars(), which already does
3146   // this check. Collecting Scalars for VF=1 does not make any sense.
3147   assert(VF.isVector() && !Scalars.contains(VF) &&
3148          "This function should not be visited twice for the same VF");
3149 
3150   // This avoids any chances of creating a REPLICATE recipe during planning
3151   // since that would result in generation of scalarized code during execution,
3152   // which is not supported for scalable vectors.
3153   if (VF.isScalable()) {
3154     Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end());
3155     return;
3156   }
3157 
3158   SmallSetVector<Instruction *, 8> Worklist;
3159 
3160   // These sets are used to seed the analysis with pointers used by memory
3161   // accesses that will remain scalar.
3162   SmallSetVector<Instruction *, 8> ScalarPtrs;
3163   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
3164   auto *Latch = TheLoop->getLoopLatch();
3165 
3166   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
3167   // The pointer operands of loads and stores will be scalar as long as the
3168   // memory access is not a gather or scatter operation. The value operand of a
3169   // store will remain scalar if the store is scalarized.
3170   auto IsScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
3171     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
3172     assert(WideningDecision != CM_Unknown &&
3173            "Widening decision should be ready at this moment");
3174     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
3175       if (Ptr == Store->getValueOperand())
3176         return WideningDecision == CM_Scalarize;
3177     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
3178            "Ptr is neither a value nor a pointer operand");
3179     return WideningDecision != CM_GatherScatter;
3180   };
3181 
3182   // A helper that returns true if the given value is a getelementptr
3183   // instruction contained in the loop.
3184   auto IsLoopVaryingGEP = [&](Value *V) {
3185     return isa<GetElementPtrInst>(V) && !TheLoop->isLoopInvariant(V);
3186   };
3187 
3188   // A helper that evaluates a memory access's use of a pointer. If the use will
3189   // be a scalar use and the pointer is only used by memory accesses, we place
3190   // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
3191   // PossibleNonScalarPtrs.
3192   auto EvaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
3193     // We only care about bitcast and getelementptr instructions contained in
3194     // the loop.
3195     if (!IsLoopVaryingGEP(Ptr))
3196       return;
3197 
3198     // If the pointer has already been identified as scalar (e.g., if it was
3199     // also identified as uniform), there's nothing to do.
3200     auto *I = cast<Instruction>(Ptr);
3201     if (Worklist.count(I))
3202       return;
3203 
3204     // If the use of the pointer will be a scalar use, and all users of the
3205     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
3206     // place the pointer in PossibleNonScalarPtrs.
3207     if (IsScalarUse(MemAccess, Ptr) &&
3208         all_of(I->users(), IsaPred<LoadInst, StoreInst>))
3209       ScalarPtrs.insert(I);
3210     else
3211       PossibleNonScalarPtrs.insert(I);
3212   };
3213 
3214   // We seed the scalars analysis with two classes of instructions: (1)
3215   // instructions marked uniform-after-vectorization and (2) bitcast,
3216   // getelementptr and (pointer) phi instructions used by memory accesses
3217   // requiring a scalar use.
3218   //
3219   // (1) Add to the worklist all instructions that have been identified as
3220   // uniform-after-vectorization.
3221   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
3222 
3223   // (2) Add to the worklist all bitcast and getelementptr instructions used by
3224   // memory accesses requiring a scalar use. The pointer operands of loads and
3225   // stores will be scalar unless the operation is a gather or scatter.
3226   // The value operand of a store will remain scalar if the store is scalarized.
3227   for (auto *BB : TheLoop->blocks())
3228     for (auto &I : *BB) {
3229       if (auto *Load = dyn_cast<LoadInst>(&I)) {
3230         EvaluatePtrUse(Load, Load->getPointerOperand());
3231       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
3232         EvaluatePtrUse(Store, Store->getPointerOperand());
3233         EvaluatePtrUse(Store, Store->getValueOperand());
3234       }
3235     }
3236   for (auto *I : ScalarPtrs)
3237     if (!PossibleNonScalarPtrs.count(I)) {
3238       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
3239       Worklist.insert(I);
3240     }
3241 
3242   // Insert the forced scalars.
3243   // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector
3244   // induction variable when the PHI user is scalarized.
3245   auto ForcedScalar = ForcedScalars.find(VF);
3246   if (ForcedScalar != ForcedScalars.end())
3247     for (auto *I : ForcedScalar->second) {
3248       LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n");
3249       Worklist.insert(I);
3250     }
3251 
3252   // Expand the worklist by looking through any bitcasts and getelementptr
3253   // instructions we've already identified as scalar. This is similar to the
3254   // expansion step in collectLoopUniforms(); however, here we're only
3255   // expanding to include additional bitcasts and getelementptr instructions.
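       // For example, if %gep2 = getelementptr %gep1, ... (illustrative names)
       // is already known to be scalar, %gep1 is added as well, provided every
       // in-loop user of %gep1 is either already in the worklist or a
       // load/store using %gep1 as a scalar pointer.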
3256   unsigned Idx = 0;
3257   while (Idx != Worklist.size()) {
3258     Instruction *Dst = Worklist[Idx++];
3259     if (!IsLoopVaryingGEP(Dst->getOperand(0)))
3260       continue;
3261     auto *Src = cast<Instruction>(Dst->getOperand(0));
3262     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
3263           auto *J = cast<Instruction>(U);
3264           return !TheLoop->contains(J) || Worklist.count(J) ||
3265                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
3266                   IsScalarUse(J, Src));
3267         })) {
3268       Worklist.insert(Src);
3269       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
3270     }
3271   }
3272 
3273   // An induction variable will remain scalar if all users of the induction
3274   // variable and induction variable update remain scalar.
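       // For example, if %i and its update %i.next (illustrative names) are
       // used only by a scalarized GEP, the latch compare and each other, both
       // remain scalar; if either also feeds a widened instruction, neither is
       // added.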
3275   for (const auto &Induction : Legal->getInductionVars()) {
3276     auto *Ind = Induction.first;
3277     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
3278 
3279     // If tail-folding is applied, the primary induction variable will be used
3280     // to feed a vector compare.
3281     if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
3282       continue;
3283 
3284     // Returns true if \p Indvar is a pointer induction that is used directly by
3285     // load/store instruction \p I.
3286     auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
3287                                               Instruction *I) {
3288       return Induction.second.getKind() ==
3289                  InductionDescriptor::IK_PtrInduction &&
3290              (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
3291              Indvar == getLoadStorePointerOperand(I) && IsScalarUse(I, Indvar);
3292     };
3293 
3294     // Determine if all users of the induction variable are scalar after
3295     // vectorization.
3296     bool ScalarInd = all_of(Ind->users(), [&](User *U) -> bool {
3297       auto *I = cast<Instruction>(U);
3298       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
3299              IsDirectLoadStoreFromPtrIndvar(Ind, I);
3300     });
3301     if (!ScalarInd)
3302       continue;
3303 
3304     // If the induction variable update is a fixed-order recurrence, neither the
3305     // induction variable nor its update should be marked scalar after
3306     // vectorization.
3307     auto *IndUpdatePhi = dyn_cast<PHINode>(IndUpdate);
3308     if (IndUpdatePhi && Legal->isFixedOrderRecurrence(IndUpdatePhi))
3309       continue;
3310 
3311     // Determine if all users of the induction variable update instruction are
3312     // scalar after vectorization.
3313     bool ScalarIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool {
3314       auto *I = cast<Instruction>(U);
3315       return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
3316              IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
3317     });
3318     if (!ScalarIndUpdate)
3319       continue;
3320 
3321     // The induction variable and its update instruction will remain scalar.
3322     Worklist.insert(Ind);
3323     Worklist.insert(IndUpdate);
3324     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
3325     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
3326                       << "\n");
3327   }
3328 
3329   Scalars[VF].insert(Worklist.begin(), Worklist.end());
3330 }
3331 
3332 bool LoopVectorizationCostModel::isScalarWithPredication(
3333     Instruction *I, ElementCount VF) const {
3334   if (!isPredicatedInst(I))
3335     return false;
3336 
3337   // Do we have a non-scalar lowering for this predicated
3338   // instruction? No - it is scalar with predication.
3339   switch (I->getOpcode()) {
3340   default:
3341     return true;
3342   case Instruction::Call:
3343     if (VF.isScalar())
3344       return true;
3345     return CallWideningDecisions.at(std::make_pair(cast<CallInst>(I), VF))
3346                .Kind == CM_Scalarize;
3347   case Instruction::Load:
3348   case Instruction::Store: {
3349     auto *Ptr = getLoadStorePointerOperand(I);
3350     auto *Ty = getLoadStoreType(I);
3351     Type *VTy = Ty;
3352     if (VF.isVector())
3353       VTy = VectorType::get(Ty, VF);
3354     const Align Alignment = getLoadStoreAlignment(I);
3355     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
3356                                 TTI.isLegalMaskedGather(VTy, Alignment))
3357                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
3358                                 TTI.isLegalMaskedScatter(VTy, Alignment));
3359   }
3360   case Instruction::UDiv:
3361   case Instruction::SDiv:
3362   case Instruction::SRem:
3363   case Instruction::URem: {
3364     // We have the option to use the safe-divisor idiom to avoid predication.
3365     // The cost based decision here will always select safe-divisor for
3366     // scalable vectors as scalarization isn't legal.
3367     const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
3368     return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost);
3369   }
3370   }
3371 }
3372 
3373 // TODO: Fold into LoopVectorizationLegality::isMaskRequired.
3374 bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
3375   // If predication is not needed, avoid it.
3376   // TODO: We can use the loop-preheader as context point here and get
3377   // context sensitive reasoning for isSafeToSpeculativelyExecute.
3378   if (!blockNeedsPredicationForAnyReason(I->getParent()) ||
3379       isSafeToSpeculativelyExecute(I) ||
3380       (isa<LoadInst, StoreInst, CallInst>(I) && !Legal->isMaskRequired(I)) ||
3381       isa<BranchInst, SwitchInst, PHINode, AllocaInst>(I))
3382     return false;
3383 
3384   // If the instruction was executed conditionally in the original scalar loop,
3385   // predication is needed with a mask whose lanes are all possibly inactive.
3386   if (Legal->blockNeedsPredication(I->getParent()))
3387     return true;
3388 
3389   // All that remain are instructions with side-effects originally executed in
3390   // the loop unconditionally, but now execute under a tail-fold mask (only)
3391   // having at least one active lane (the first). If the side-effects of the
3392   // instruction are invariant, executing it w/o (the tail-folding) mask is safe
3393   // - it will cause the same side-effects as when masked.
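       // For example, a store of a loop-invariant value to a loop-invariant
       // address has the same effect whether it runs once per scalar iteration
       // or once per (partially masked) vector iteration, so it needs no
       // predication; see the Store case below.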
3394   switch (I->getOpcode()) {
3395   default:
3396     llvm_unreachable(
3397         "instruction should have been considered by earlier checks");
3398   case Instruction::Call:
3399     // Side-effects of a Call are assumed to be non-invariant, needing a
3400     // (fold-tail) mask.
3401     assert(Legal->isMaskRequired(I) &&
3402            "should have returned earlier for calls not needing a mask");
3403     return true;
3404   case Instruction::Load:
3405     // If the address is loop invariant no predication is needed.
3406     return !Legal->isInvariant(getLoadStorePointerOperand(I));
3407   case Instruction::Store: {
3408     // For stores, we need to prove both speculation safety (which follows
3409     // from the same argument as loads) and that the value being stored is
3410     // correct. The easiest form of the latter is to require that all values
3411     // stored are the same.
3412     return !(Legal->isInvariant(getLoadStorePointerOperand(I)) &&
3413              TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()));
3414   }
3415   case Instruction::UDiv:
3416   case Instruction::SDiv:
3417   case Instruction::SRem:
3418   case Instruction::URem:
3419     // If the divisor is loop-invariant no predication is needed.
3420     return !TheLoop->isLoopInvariant(I->getOperand(1));
3421   }
3422 }
3423 
3424 std::pair<InstructionCost, InstructionCost>
3425 LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
3426                                                     ElementCount VF) const {
3427   assert(I->getOpcode() == Instruction::UDiv ||
3428          I->getOpcode() == Instruction::SDiv ||
3429          I->getOpcode() == Instruction::SRem ||
3430          I->getOpcode() == Instruction::URem);
3431   assert(!isSafeToSpeculativelyExecute(I));
3432 
3433   const TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
3434 
3435   // Scalarization isn't legal for scalable vector types
3436   InstructionCost ScalarizationCost = InstructionCost::getInvalid();
3437   if (!VF.isScalable()) {
3438     // Get the scalarization cost and scale this amount by the probability of
3439     // executing the predicated block. If the instruction is not predicated,
3440     // we fall through to the next case.
3441     ScalarizationCost = 0;
3442 
3443     // These instructions have a non-void type, so account for the phi nodes
3444     // that we will create. This cost is likely to be zero. The phi node
3445     // cost, if any, should be scaled by the block probability because it
3446     // models a copy at the end of each predicated block.
3447     ScalarizationCost += VF.getKnownMinValue() *
3448       TTI.getCFInstrCost(Instruction::PHI, CostKind);
3449 
3450     // The cost of the non-predicated instruction.
3451     ScalarizationCost += VF.getKnownMinValue() *
3452       TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind);
3453 
3454     // The cost of insertelement and extractelement instructions needed for
3455     // scalarization.
3456     ScalarizationCost += getScalarizationOverhead(I, VF, CostKind);
3457 
3458     // Scale the cost by the probability of executing the predicated blocks.
3459     // This assumes the predicated block for each vector lane is equally
3460     // likely.
3461     ScalarizationCost = ScalarizationCost / getReciprocalPredBlockProb();
3462   }
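       // Rough illustration (assuming a reciprocal block probability of 2):
       // with VF = 4, a zero PHI cost, a scalar divide cost of 2 and an
       // insert/extract overhead of 8, ScalarizationCost = (4 * 2 + 8) / 2 = 8.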
3463   InstructionCost SafeDivisorCost = 0;
3464 
3465   auto *VecTy = toVectorTy(I->getType(), VF);
3466 
3467   // The cost of the select guard to ensure all lanes are well defined
3468   // after we speculate above any internal control flow.
3469   SafeDivisorCost +=
3470       TTI.getCmpSelInstrCost(Instruction::Select, VecTy,
3471                              toVectorTy(Type::getInt1Ty(I->getContext()), VF),
3472                              CmpInst::BAD_ICMP_PREDICATE, CostKind);
3473 
3474   // Certain instructions can be cheaper to vectorize if they have a constant
3475   // second vector operand. One example of this are shifts on x86.
3476   Value *Op2 = I->getOperand(1);
3477   auto Op2Info = TTI.getOperandInfo(Op2);
3478   if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
3479       Legal->isInvariant(Op2))
3480     Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
3481 
3482   SmallVector<const Value *, 4> Operands(I->operand_values());
3483   SafeDivisorCost += TTI.getArithmeticInstrCost(
3484     I->getOpcode(), VecTy, CostKind,
3485     {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
3486     Op2Info, Operands, I);
3487   return {ScalarizationCost, SafeDivisorCost};
3488 }
3489 
3490 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
3491     Instruction *I, ElementCount VF) const {
3492   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
3493   assert(getWideningDecision(I, VF) == CM_Unknown &&
3494          "Decision should not be set yet.");
3495   auto *Group = getInterleavedAccessGroup(I);
3496   assert(Group && "Must have a group.");
3497   unsigned InterleaveFactor = Group->getFactor();
3498 
3499   // If the instruction's allocated size doesn't equal its type size, it
3500   // requires padding and will be scalarized.
3501   auto &DL = I->getDataLayout();
3502   auto *ScalarTy = getLoadStoreType(I);
3503   if (hasIrregularType(ScalarTy, DL))
3504     return false;
3505 
3506   // We currently only know how to emit interleave/deinterleave with
3507   // Factor=2 for scalable vectors. This is purely an implementation
3508   // limit.
3509   if (VF.isScalable() && InterleaveFactor != 2)
3510     return false;
3511 
3512   // If the group involves a non-integral pointer, we may not be able to
3513   // losslessly cast all values to a common type.
3514   bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy);
3515   for (unsigned Idx = 0; Idx < InterleaveFactor; Idx++) {
3516     Instruction *Member = Group->getMember(Idx);
3517     if (!Member)
3518       continue;
3519     auto *MemberTy = getLoadStoreType(Member);
3520     bool MemberNI = DL.isNonIntegralPointerType(MemberTy);
3521     // Don't coerce non-integral pointers to integers or vice versa.
3522     if (MemberNI != ScalarNI)
3523       // TODO: Consider adding special nullptr value case here
3524       return false;
3525     if (MemberNI && ScalarNI &&
3526         ScalarTy->getPointerAddressSpace() !=
3527             MemberTy->getPointerAddressSpace())
3528       return false;
3529   }
3530 
3531   // Check if masking is required.
3532   // A Group may need masking for one of two reasons: it resides in a block that
3533   // needs predication, or it was decided to use masking to deal with gaps
3534   // (either a gap at the end of a load-access that may result in a speculative
3535   // load, or any gaps in a store-access).
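       // For example, a factor-2 load group that accesses only A[2*i] (an
       // illustrative access pattern) still loads the A[2*i+1] slots when
       // widened; in the final iteration that may read past what the scalar
       // loop would have touched, which is only acceptable if a scalar
       // epilogue handles the tail or the wide load is masked.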
3536   bool PredicatedAccessRequiresMasking =
3537       blockNeedsPredicationForAnyReason(I->getParent()) &&
3538       Legal->isMaskRequired(I);
3539   bool LoadAccessWithGapsRequiresEpilogMasking =
3540       isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
3541       !isScalarEpilogueAllowed();
3542   bool StoreAccessWithGapsRequiresMasking =
3543       isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
3544   if (!PredicatedAccessRequiresMasking &&
3545       !LoadAccessWithGapsRequiresEpilogMasking &&
3546       !StoreAccessWithGapsRequiresMasking)
3547     return true;
3548 
3549   // If masked interleaving is required, we expect that the user/target had
3550   // enabled it, because otherwise it either wouldn't have been created or
3551   // it should have been invalidated by the CostModel.
3552   assert(useMaskedInterleavedAccesses(TTI) &&
3553          "Masked interleave-groups for predicated accesses are not enabled.");
3554 
3555   if (Group->isReverse())
3556     return false;
3557 
3558   auto *Ty = getLoadStoreType(I);
3559   const Align Alignment = getLoadStoreAlignment(I);
3560   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
3561                           : TTI.isLegalMaskedStore(Ty, Alignment);
3562 }
3563 
3564 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
3565     Instruction *I, ElementCount VF) {
3566   // Get and ensure we have a valid memory instruction.
3567   assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
3568 
3569   auto *Ptr = getLoadStorePointerOperand(I);
3570   auto *ScalarTy = getLoadStoreType(I);
3571 
3572   // In order to be widened, the pointer should be consecutive, first of all.
3573   if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
3574     return false;
3575 
3576   // If the instruction is a store located in a predicated block, it will be
3577   // scalarized.
3578   if (isScalarWithPredication(I, VF))
3579     return false;
3580 
3581   // If the instruction's allocated size doesn't equal its type size, it
3582   // requires padding and will be scalarized.
3583   auto &DL = I->getDataLayout();
3584   if (hasIrregularType(ScalarTy, DL))
3585     return false;
3586 
3587   return true;
3588 }
3589 
3590 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
3591   // We should not collect Uniforms more than once per VF. Right now,
3592   // this function is called from collectUniformsAndScalars(), which
3593   // already does this check. Collecting Uniforms for VF=1 does not make any
3594   // sense.
3595 
3596   assert(VF.isVector() && !Uniforms.contains(VF) &&
3597          "This function should not be visited twice for the same VF");
3598 
3599   // Initialize the entry for this VF. Even if no uniform values are found,
3600   // the entry ensures Uniforms.count(VF) returns 1, so we won't analyze again.
3601   Uniforms[VF].clear();
3602 
3603   // Now we know that the loop is vectorizable!
3604   // Collect instructions inside the loop that will remain uniform after
3605   // vectorization.
3606 
3607   // Global values, parameters, and instructions outside of the current loop
3608   // are out of scope.
3609   auto IsOutOfScope = [&](Value *V) -> bool {
3610     Instruction *I = dyn_cast<Instruction>(V);
3611     return (!I || !TheLoop->contains(I));
3612   };
3613 
3614   // Worklist containing uniform instructions demanding lane 0.
3615   SetVector<Instruction *> Worklist;
3616 
3617   // Add uniform instructions demanding lane 0 to the worklist. Instructions
3618   // that require predication must not be considered uniform after
3619   // vectorization, because that would create an erroneous replicating region
3620   // where only a single instance out of VF should be formed.
3621   auto AddToWorklistIfAllowed = [&](Instruction *I) -> void {
3622     if (IsOutOfScope(I)) {
3623       LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
3624                         << *I << "\n");
3625       return;
3626     }
3627     if (isPredicatedInst(I)) {
3628       LLVM_DEBUG(
3629           dbgs() << "LV: Found not uniform due to requiring predication: " << *I
3630                  << "\n");
3631       return;
3632     }
3633     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
3634     Worklist.insert(I);
3635   };
3636 
3637   // Start with the conditional branches exiting the loop. If the branch
3638   // condition is an instruction contained in the loop that is only used by the
3639   // branch, it is uniform. Note conditions from uncountable early exits are not
3640   // uniform.
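       // For example (illustrative), a latch exit condition such as
       //   %cmp = icmp eq i64 %iv.next, %n
       //   br i1 %cmp, label %exit, label %loop
       // where %cmp has no users other than the branch is treated as uniform.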
3641   SmallVector<BasicBlock *> Exiting;
3642   TheLoop->getExitingBlocks(Exiting);
3643   for (BasicBlock *E : Exiting) {
3644     if (Legal->hasUncountableEarlyExit() && TheLoop->getLoopLatch() != E)
3645       continue;
3646     auto *Cmp = dyn_cast<Instruction>(E->getTerminator()->getOperand(0));
3647     if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
3648       AddToWorklistIfAllowed(Cmp);
3649   }
3650 
3651   auto PrevVF = VF.divideCoefficientBy(2);
3652   // Return true if all lanes perform the same memory operation, and we can
3653   // thus choose to execute only one.
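       // Illustrative example: a load such as
       //   %v = load i32, ptr %invariant.addr
       // inside the loop reads the same address on every iteration, so one
       // scalar load per vector iteration suffices; a store to an invariant
       // address only qualifies if the stored value is also loop-invariant.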
3654   auto IsUniformMemOpUse = [&](Instruction *I) {
3655     // If the value was already known to not be uniform for the previous
3656     // (smaller VF), it cannot be uniform for the larger VF.
3657     if (PrevVF.isVector()) {
3658       auto Iter = Uniforms.find(PrevVF);
3659       if (Iter != Uniforms.end() && !Iter->second.contains(I))
3660         return false;
3661     }
3662     if (!Legal->isUniformMemOp(*I, VF))
3663       return false;
3664     if (isa<LoadInst>(I))
3665       // Loading the same address always produces the same result - at least
3666       // assuming aliasing and ordering which have already been checked.
3667       return true;
3668     // Storing the same value on every iteration.
3669     return TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand());
3670   };
3671 
3672   auto IsUniformDecision = [&](Instruction *I, ElementCount VF) {
3673     InstWidening WideningDecision = getWideningDecision(I, VF);
3674     assert(WideningDecision != CM_Unknown &&
3675            "Widening decision should be ready at this moment");
3676 
3677     if (IsUniformMemOpUse(I))
3678       return true;
3679 
3680     return (WideningDecision == CM_Widen ||
3681             WideningDecision == CM_Widen_Reverse ||
3682             WideningDecision == CM_Interleave);
3683   };
3684 
3685   // Returns true if Ptr is the pointer operand of a memory access instruction
3686   // I, I is known to not require scalarization, and the pointer is not also
3687   // stored.
3688   auto IsVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
3689     if (isa<StoreInst>(I) && I->getOperand(0) == Ptr)
3690       return false;
3691     return getLoadStorePointerOperand(I) == Ptr &&
3692            (IsUniformDecision(I, VF) || Legal->isInvariant(Ptr));
3693   };
3694 
3695   // Holds a list of values which are known to have at least one uniform use.
3696   // Note that there may be other uses which aren't uniform.  A "uniform use"
3697   // here is something which only demands lane 0 of the unrolled iterations;
3698   // it does not imply that all lanes produce the same value (i.e., this is
3699   // not the usual meaning of uniform).
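       // For example (illustrative), the address of a consecutive widened load
       // is demanded only for lane 0, because the remaining lanes are implied
       // by the wide access, even though each lane's address differs.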
3700   SetVector<Value *> HasUniformUse;
3701 
3702   // Scan the loop for instructions which are either a) known to have only
3703   // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
3704   for (auto *BB : TheLoop->blocks())
3705     for (auto &I : *BB) {
3706       if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
3707         switch (II->getIntrinsicID()) {
3708         case Intrinsic::sideeffect:
3709         case Intrinsic::experimental_noalias_scope_decl:
3710         case Intrinsic::assume:
3711         case Intrinsic::lifetime_start:
3712         case Intrinsic::lifetime_end:
3713           if (TheLoop->hasLoopInvariantOperands(&I))
3714             AddToWorklistIfAllowed(&I);
3715           break;
3716         default:
3717           break;
3718         }
3719       }
3720 
3721       // ExtractValue instructions must be uniform, because the operands are
3722       // known to be loop-invariant.
3723       if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
3724         assert(IsOutOfScope(EVI->getAggregateOperand()) &&
3725                "Expected aggregate value to be loop invariant");
3726         AddToWorklistIfAllowed(EVI);
3727         continue;
3728       }
3729 
3730       // If there's no pointer operand, there's nothing to do.
3731       auto *Ptr = getLoadStorePointerOperand(&I);
3732       if (!Ptr)
3733         continue;
3734 
3735       if (IsUniformMemOpUse(&I))
3736         AddToWorklistIfAllowed(&I);
3737 
3738       if (IsVectorizedMemAccessUse(&I, Ptr))
3739         HasUniformUse.insert(Ptr);
3740     }
3741 
3742   // Add to the worklist any operands which have *only* uniform (e.g. lane 0
3743   // demanding) users.  Since loops are assumed to be in LCSSA form, this
3744   // disallows uses outside the loop as well.
3745   for (auto *V : HasUniformUse) {
3746     if (IsOutOfScope(V))
3747       continue;
3748     auto *I = cast<Instruction>(V);
3749     bool UsersAreMemAccesses = all_of(I->users(), [&](User *U) -> bool {
3750       auto *UI = cast<Instruction>(U);
3751       return TheLoop->contains(UI) && IsVectorizedMemAccessUse(UI, V);
3752     });
3753     if (UsersAreMemAccesses)
3754       AddToWorklistIfAllowed(I);
3755   }
3756 
3757   // Expand Worklist in topological order: whenever a new instruction
3758   // is added, its users should already be inside Worklist. This ensures
3759   // a uniform instruction will only be used by uniform instructions.
3760   unsigned Idx = 0;
3761   while (Idx != Worklist.size()) {
3762     Instruction *I = Worklist[Idx++];
3763 
3764     for (auto *OV : I->operand_values()) {
3765       // Out-of-scope operands cannot be uniform instructions.
3766       if (IsOutOfScope(OV))
3767         continue;
3768       // First-order recurrence phis should typically be considered
3769       // non-uniform.
3770       auto *OP = dyn_cast<PHINode>(OV);
3771       if (OP && Legal->isFixedOrderRecurrence(OP))
3772         continue;
3773       // If all the users of the operand are uniform, then add the
3774       // operand into the uniform worklist.
3775       auto *OI = cast<Instruction>(OV);
3776       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
3777             auto *J = cast<Instruction>(U);
3778             return Worklist.count(J) || IsVectorizedMemAccessUse(J, OI);
3779           }))
3780         AddToWorklistIfAllowed(OI);
3781     }
3782   }
3783 
3784   // For an instruction to be added into Worklist above, all its users inside
3785   // the loop should also be in Worklist. However, this condition cannot be
3786   // true for phi nodes that form a cyclic dependence. We must process phi
3787   // nodes separately. An induction variable will remain uniform if all users
3788   // of the induction variable and induction variable update remain uniform.
3789   // The code below handles both pointer and non-pointer induction variables.
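       // Illustrative example: an IV that is only used by its own update and
       // by address computations of widened consecutive accesses remains
       // uniform, whereas an IV whose value is stored on every lane does not.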
3790   BasicBlock *Latch = TheLoop->getLoopLatch();
3791   for (const auto &Induction : Legal->getInductionVars()) {
3792     auto *Ind = Induction.first;
3793     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
3794 
3795     // Determine if all users of the induction variable are uniform after
3796     // vectorization.
3797     bool UniformInd = all_of(Ind->users(), [&](User *U) -> bool {
3798       auto *I = cast<Instruction>(U);
3799       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
3800              IsVectorizedMemAccessUse(I, Ind);
3801     });
3802     if (!UniformInd)
3803       continue;
3804 
3805     // Determine if all users of the induction variable update instruction are
3806     // uniform after vectorization.
3807     bool UniformIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool {
3808       auto *I = cast<Instruction>(U);
3809       return I == Ind || Worklist.count(I) ||
3810              IsVectorizedMemAccessUse(I, IndUpdate);
3811     });
3812     if (!UniformIndUpdate)
3813       continue;
3814 
3815     // The induction variable and its update instruction will remain uniform.
3816     AddToWorklistIfAllowed(Ind);
3817     AddToWorklistIfAllowed(IndUpdate);
3818   }
3819 
3820   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
3821 }
3822 
3823 bool LoopVectorizationCostModel::runtimeChecksRequired() {
3824   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
3825 
3826   if (Legal->getRuntimePointerChecking()->Need) {
3827     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
3828         "runtime pointer checks needed. Enable vectorization of this "
3829         "loop with '#pragma clang loop vectorize(enable)' when "
3830         "compiling with -Os/-Oz",
3831         "CantVersionLoopWithOptForSize", ORE, TheLoop);
3832     return true;
3833   }
3834 
3835   if (!PSE.getPredicate().isAlwaysTrue()) {
3836     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
3837         "runtime SCEV checks needed. Enable vectorization of this "
3838         "loop with '#pragma clang loop vectorize(enable)' when "
3839         "compiling with -Os/-Oz",
3840         "CantVersionLoopWithOptForSize", ORE, TheLoop);
3841     return true;
3842   }
3843 
3844   // FIXME: Avoid specializing for stride==1 instead of bailing out.
3845   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
3846     reportVectorizationFailure("Runtime stride check for small trip count",
3847         "runtime stride == 1 checks needed. Enable vectorization of "
3848         "this loop without such check by compiling with -Os/-Oz",
3849         "CantVersionLoopWithOptForSize", ORE, TheLoop);
3850     return true;
3851   }
3852 
3853   return false;
3854 }
3855 
3856 bool LoopVectorizationCostModel::isScalableVectorizationAllowed() {
3857   if (IsScalableVectorizationAllowed)
3858     return *IsScalableVectorizationAllowed;
3859 
3860   IsScalableVectorizationAllowed = false;
3861   if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
3862     return false;
3863 
3864   if (Hints->isScalableVectorizationDisabled()) {
3865     reportVectorizationInfo("Scalable vectorization is explicitly disabled",
3866                             "ScalableVectorizationDisabled", ORE, TheLoop);
3867     return false;
3868   }
3869 
3870   LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
3871 
3872   auto MaxScalableVF = ElementCount::getScalable(
3873       std::numeric_limits<ElementCount::ScalarTy>::max());
3874 
3875   // Test that the loop-vectorizer can legalize all operations for this MaxVF.
3876   // FIXME: While for scalable vectors this is currently sufficient, this should
3877   // be replaced by a more detailed mechanism that filters out specific VFs,
3878   // instead of invalidating vectorization for a whole set of VFs based on the
3879   // MaxVF.
3880 
3881   // Disable scalable vectorization if the loop contains unsupported reductions.
3882   if (!canVectorizeReductions(MaxScalableVF)) {
3883     reportVectorizationInfo(
3884         "Scalable vectorization not supported for the reduction "
3885         "operations found in this loop.",
3886         "ScalableVFUnfeasible", ORE, TheLoop);
3887     return false;
3888   }
3889 
3890   // Disable scalable vectorization if the loop contains any instructions
3891   // with element types not supported for scalable vectors.
3892   if (any_of(ElementTypesInLoop, [&](Type *Ty) {
3893         return !Ty->isVoidTy() &&
3894                !this->TTI.isElementTypeLegalForScalableVector(Ty);
3895       })) {
3896     reportVectorizationInfo("Scalable vectorization is not supported "
3897                             "for all element types found in this loop.",
3898                             "ScalableVFUnfeasible", ORE, TheLoop);
3899     return false;
3900   }
3901 
3902   if (!Legal->isSafeForAnyVectorWidth() && !getMaxVScale(*TheFunction, TTI)) {
3903     reportVectorizationInfo("The target does not provide maximum vscale value "
3904                             "for safe distance analysis.",
3905                             "ScalableVFUnfeasible", ORE, TheLoop);
3906     return false;
3907   }
3908 
3909   IsScalableVectorizationAllowed = true;
3910   return true;
3911 }
3912 
3913 ElementCount
3914 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
3915   if (!isScalableVectorizationAllowed())
3916     return ElementCount::getScalable(0);
3917 
3918   auto MaxScalableVF = ElementCount::getScalable(
3919       std::numeric_limits<ElementCount::ScalarTy>::max());
3920   if (Legal->isSafeForAnyVectorWidth())
3921     return MaxScalableVF;
3922 
3923   std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
3924   // Limit MaxScalableVF by the maximum safe dependence distance.
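       // For example (illustrative), MaxSafeElements = 32 with a maximum
       // vscale of 16 yields ElementCount::getScalable(2), i.e. vscale x 2.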
3925   MaxScalableVF = ElementCount::getScalable(MaxSafeElements / *MaxVScale);
3926 
3927   if (!MaxScalableVF)
3928     reportVectorizationInfo(
3929         "Max legal vector width too small, scalable vectorization "
3930         "unfeasible.",
3931         "ScalableVFUnfeasible", ORE, TheLoop);
3932 
3933   return MaxScalableVF;
3934 }
3935 
3936 FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
3937     unsigned MaxTripCount, ElementCount UserVF, bool FoldTailByMasking) {
3938   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
3939   unsigned SmallestType, WidestType;
3940   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
3941 
3942   // Get the maximum safe dependence distance in bits computed by LAA.
3943   // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
3944   // the memory accesses that is most restrictive (involved in the smallest
3945   // dependence distance).
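       // For example (illustrative), a maximum safe width of 256 bits with a
       // widest type of 32 bits yields bit_floor(256 / 32) = 8 safe elements.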
3946   unsigned MaxSafeElements =
3947       llvm::bit_floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
3948 
3949   auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
3950   auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
3951   if (!Legal->isSafeForAnyVectorWidth())
3952     this->MaxSafeElements = MaxSafeElements;
3953 
3954   LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
3955                     << ".\n");
3956   LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
3957                     << ".\n");
3958 
3959   // First analyze the UserVF, fall back if the UserVF should be ignored.
3960   if (UserVF) {
3961     auto MaxSafeUserVF =
3962         UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
3963 
3964     if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
3965       // If `VF=vscale x N` is safe, then so is `VF=N`
3966       if (UserVF.isScalable())
3967         return FixedScalableVFPair(
3968             ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
3969 
3970       return UserVF;
3971     }
3972 
3973     assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
3974 
3975     // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
3976     // is better to ignore the hint and let the compiler choose a suitable VF.
3977     if (!UserVF.isScalable()) {
3978       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3979                         << " is unsafe, clamping to max safe VF="
3980                         << MaxSafeFixedVF << ".\n");
3981       ORE->emit([&]() {
3982         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
3983                                           TheLoop->getStartLoc(),
3984                                           TheLoop->getHeader())
3985                << "User-specified vectorization factor "
3986                << ore::NV("UserVectorizationFactor", UserVF)
3987                << " is unsafe, clamping to maximum safe vectorization factor "
3988                << ore::NV("VectorizationFactor", MaxSafeFixedVF);
3989       });
3990       return MaxSafeFixedVF;
3991     }
3992 
3993     if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
3994       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3995                         << " is ignored because scalable vectors are not "
3996                            "available.\n");
3997       ORE->emit([&]() {
3998         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
3999                                           TheLoop->getStartLoc(),
4000                                           TheLoop->getHeader())
4001                << "User-specified vectorization factor "
4002                << ore::NV("UserVectorizationFactor", UserVF)
4003                << " is ignored because the target does not support scalable "
4004                   "vectors. The compiler will pick a more suitable value.";
4005       });
4006     } else {
4007       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4008                         << " is unsafe. Ignoring scalable UserVF.\n");
4009       ORE->emit([&]() {
4010         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4011                                           TheLoop->getStartLoc(),
4012                                           TheLoop->getHeader())
4013                << "User-specified vectorization factor "
4014                << ore::NV("UserVectorizationFactor", UserVF)
4015                << " is unsafe. Ignoring the hint to let the compiler pick a "
4016                   "more suitable value.";
4017       });
4018     }
4019   }
4020 
4021   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
4022                     << " / " << WidestType << " bits.\n");
4023 
4024   FixedScalableVFPair Result(ElementCount::getFixed(1),
4025                              ElementCount::getScalable(0));
4026   if (auto MaxVF =
4027           getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
4028                                   MaxSafeFixedVF, FoldTailByMasking))
4029     Result.FixedVF = MaxVF;
4030 
4031   if (auto MaxVF =
4032           getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
4033                                   MaxSafeScalableVF, FoldTailByMasking))
4034     if (MaxVF.isScalable()) {
4035       Result.ScalableVF = MaxVF;
4036       LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
4037                         << "\n");
4038     }
4039 
4040   return Result;
4041 }
4042 
4043 FixedScalableVFPair
4044 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
4045   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
4046     // TODO: It may be useful to do this anyway, since the check is still
4047     // likely to be dynamically uniform if the target can skip it.
4048     reportVectorizationFailure(
4049         "Not inserting runtime ptr check for divergent target",
4050         "runtime pointer checks needed. Not enabled for divergent target",
4051         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
4052     return FixedScalableVFPair::getNone();
4053   }
4054 
4055   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4056   unsigned MaxTC = PSE.getSmallConstantMaxTripCount();
4057   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4058   if (TC != MaxTC)
4059     LLVM_DEBUG(dbgs() << "LV: Found maximum trip count: " << MaxTC << '\n');
4060   if (TC == 1) {
4061     reportVectorizationFailure("Single iteration (non) loop",
4062         "loop trip count is one, irrelevant for vectorization",
4063         "SingleIterationLoop", ORE, TheLoop);
4064     return FixedScalableVFPair::getNone();
4065   }
4066 
4067   switch (ScalarEpilogueStatus) {
4068   case CM_ScalarEpilogueAllowed:
4069     return computeFeasibleMaxVF(MaxTC, UserVF, false);
4070   case CM_ScalarEpilogueNotAllowedUsePredicate:
4071     [[fallthrough]];
4072   case CM_ScalarEpilogueNotNeededUsePredicate:
4073     LLVM_DEBUG(
4074         dbgs() << "LV: vector predicate hint/switch found.\n"
4075                << "LV: Not allowing scalar epilogue, creating predicated "
4076                << "vector loop.\n");
4077     break;
4078   case CM_ScalarEpilogueNotAllowedLowTripLoop:
4079     // fallthrough as a special case of OptForSize
4080   case CM_ScalarEpilogueNotAllowedOptSize:
4081     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
4082       LLVM_DEBUG(
4083           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
4084     else
4085       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
4086                         << "count.\n");
4087 
4088     // Bail if runtime checks are required, which are not good when optimising
4089     // for size.
4090     if (runtimeChecksRequired())
4091       return FixedScalableVFPair::getNone();
4092 
4093     break;
4094   }
4095 
4096   // The only loops we can vectorize without a scalar epilogue are loops with
4097   // a bottom-test and a single exiting block. We'd have to handle the fact
4098   // that not every instruction executes on the last iteration.  This will
4099   // require a lane mask which varies through the vector loop body.  (TODO)
4100   if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
4101     // If there was a tail-folding hint/switch, but we can't fold the tail by
4102     // masking, fallback to a vectorization with a scalar epilogue.
4103     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
4104       LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
4105                            "scalar epilogue instead.\n");
4106       ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4107       return computeFeasibleMaxVF(MaxTC, UserVF, false);
4108     }
4109     return FixedScalableVFPair::getNone();
4110   }
4111 
4112   // Now try tail folding.
4113 
4114   // Invalidate interleave groups that require an epilogue if we can't mask
4115   // the interleave-group.
4116   if (!useMaskedInterleavedAccesses(TTI)) {
4117     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
4118            "No decisions should have been taken at this point");
4119     // Note: There is no need to invalidate any cost modeling decisions here, as
4120     // none were taken so far.
4121     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
4122   }
4123 
4124   FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(MaxTC, UserVF, true);
4125 
4126   // Avoid tail folding if the trip count is known to be a multiple of any VF
4127   // we choose.
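       // Illustrative example: with a constant trip count of 1024, a maximum
       // power-of-2 VF of 8 and UserIC = 2, the remainder 1024 % 16 is 0, so
       // no tail remains for any chosen VF and tail folding can be skipped.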
4128   std::optional<unsigned> MaxPowerOf2RuntimeVF =
4129       MaxFactors.FixedVF.getFixedValue();
4130   if (MaxFactors.ScalableVF) {
4131     std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
4132     if (MaxVScale && TTI.isVScaleKnownToBeAPowerOfTwo()) {
4133       MaxPowerOf2RuntimeVF = std::max<unsigned>(
4134           *MaxPowerOf2RuntimeVF,
4135           *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue());
4136     } else
4137       MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now.
4138   }
4139 
4140   if (MaxPowerOf2RuntimeVF && *MaxPowerOf2RuntimeVF > 0) {
4141     assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
4142            "MaxFixedVF must be a power of 2");
4143     unsigned MaxVFtimesIC =
4144         UserIC ? *MaxPowerOf2RuntimeVF * UserIC : *MaxPowerOf2RuntimeVF;
4145     ScalarEvolution *SE = PSE.getSE();
4146     // Currently only loops with countable exits are vectorized, but calling
4147     // getSymbolicMaxBackedgeTakenCount allows enablement work for loops with
4148     // uncountable exits whilst also ensuring the symbolic maximum and known
4149     // back-edge taken count remain identical for loops with countable exits.
4150     const SCEV *BackedgeTakenCount = PSE.getSymbolicMaxBackedgeTakenCount();
4151     assert(BackedgeTakenCount == PSE.getBackedgeTakenCount() &&
4152            "Invalid loop count");
4153     const SCEV *ExitCount = SE->getAddExpr(
4154         BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
4155     const SCEV *Rem = SE->getURemExpr(
4156         SE->applyLoopGuards(ExitCount, TheLoop),
4157         SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
4158     if (Rem->isZero()) {
4159       // Accept MaxFixedVF if we do not have a tail.
4160       LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
4161       return MaxFactors;
4162     }
4163   }
4164 
4165   // If we don't know the precise trip count, or if the trip count that we
4166   // found modulo the vectorization factor is not zero, try to fold the tail
4167   // by masking.
4168   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
4169   setTailFoldingStyles(MaxFactors.ScalableVF.isScalable(), UserIC);
4170   if (foldTailByMasking()) {
4171     if (getTailFoldingStyle() == TailFoldingStyle::DataWithEVL) {
4172       LLVM_DEBUG(
4173           dbgs()
4174           << "LV: tail is folded with EVL, forcing unroll factor to be 1. Will "
4175              "try to generate VP Intrinsics with scalable vector "
4176              "factors only.\n");
4177       // Tail folded loop using VP intrinsics restricts the VF to be scalable
4178       // for now.
4179       // TODO: extend it for fixed vectors, if required.
4180       assert(MaxFactors.ScalableVF.isScalable() &&
4181              "Expected scalable vector factor.");
4182 
4183       MaxFactors.FixedVF = ElementCount::getFixed(1);
4184     }
4185     return MaxFactors;
4186   }
4187 
4188   // If there was a tail-folding hint/switch, but we can't fold the tail by
4189   // masking, fallback to a vectorization with a scalar epilogue.
4190   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
4191     LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
4192                          "scalar epilogue instead.\n");
4193     ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4194     return MaxFactors;
4195   }
4196 
4197   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
4198     LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
4199     return FixedScalableVFPair::getNone();
4200   }
4201 
4202   if (TC == 0) {
4203     reportVectorizationFailure(
4204         "unable to calculate the loop count due to complex control flow",
4205         "UnknownLoopCountComplexCFG", ORE, TheLoop);
4206     return FixedScalableVFPair::getNone();
4207   }
4208 
4209   reportVectorizationFailure(
4210       "Cannot optimize for size and vectorize at the same time.",
4211       "cannot optimize for size and vectorize at the same time. "
4212       "Enable vectorization of this loop with '#pragma clang loop "
4213       "vectorize(enable)' when compiling with -Os/-Oz",
4214       "NoTailLoopWithOptForSize", ORE, TheLoop);
4215   return FixedScalableVFPair::getNone();
4216 }
4217 
4218 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
4219     unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType,
4220     ElementCount MaxSafeVF, bool FoldTailByMasking) {
4221   bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
4222   const TypeSize WidestRegister = TTI.getRegisterBitWidth(
4223       ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
4224                            : TargetTransformInfo::RGK_FixedWidthVector);
4225 
4226   // Convenience function to return the minimum of two ElementCounts.
4227   auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
4228     assert((LHS.isScalable() == RHS.isScalable()) &&
4229            "Scalable flags must match");
4230     return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
4231   };
4232 
4233   // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
4234   // Note that both WidestRegister and WidestType may not be powers of 2.
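       // For example (illustrative), 128-bit registers with a widest type of
       // 32 bits give bit_floor(128 / 32) = 4 elements, or vscale x 4 when
       // computing the scalable maximum.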
4235   auto MaxVectorElementCount = ElementCount::get(
4236       llvm::bit_floor(WidestRegister.getKnownMinValue() / WidestType),
4237       ComputeScalableMaxVF);
4238   MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
4239   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
4240                     << (MaxVectorElementCount * WidestType) << " bits.\n");
4241 
4242   if (!MaxVectorElementCount) {
4243     LLVM_DEBUG(dbgs() << "LV: The target has no "
4244                       << (ComputeScalableMaxVF ? "scalable" : "fixed")
4245                       << " vector registers.\n");
4246     return ElementCount::getFixed(1);
4247   }
4248 
4249   unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue();
4250   if (MaxVectorElementCount.isScalable() &&
4251       TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
4252     auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
4253     auto Min = Attr.getVScaleRangeMin();
4254     WidestRegisterMinEC *= Min;
4255   }
4256 
4257   // When a scalar epilogue is required, at least one iteration of the scalar
4258   // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
4259   // max VF that results in a dead vector loop.
4260   if (MaxTripCount > 0 && requiresScalarEpilogue(true))
4261     MaxTripCount -= 1;
4262 
4263   if (MaxTripCount && MaxTripCount <= WidestRegisterMinEC &&
4264       (!FoldTailByMasking || isPowerOf2_32(MaxTripCount))) {
4265     // If upper bound loop trip count (TC) is known at compile time there is no
4266     // point in choosing VF greater than TC (as done in the loop below). Select
4267     // maximum power of two which doesn't exceed TC. If MaxVectorElementCount is
4268     // scalable, we only fall back on a fixed VF when the TC is less than or
4269     // equal to the known number of lanes.
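         // Illustrative example: a known maximum trip count of 6 is clamped to
         // the largest power of two not exceeding it, i.e. a VF of 4.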
4270     auto ClampedUpperTripCount = llvm::bit_floor(MaxTripCount);
4271     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
4272                          "exceeding the constant trip count: "
4273                       << ClampedUpperTripCount << "\n");
4274     return ElementCount::get(
4275         ClampedUpperTripCount,
4276         FoldTailByMasking ? MaxVectorElementCount.isScalable() : false);
4277   }
4278 
4279   TargetTransformInfo::RegisterKind RegKind =
4280       ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
4281                            : TargetTransformInfo::RGK_FixedWidthVector;
4282   ElementCount MaxVF = MaxVectorElementCount;
4283   if (MaximizeBandwidth ||
4284       (MaximizeBandwidth.getNumOccurrences() == 0 &&
4285        (TTI.shouldMaximizeVectorBandwidth(RegKind) ||
4286         (UseWiderVFIfCallVariantsPresent && Legal->hasVectorCallVariants())))) {
4287     auto MaxVectorElementCountMaxBW = ElementCount::get(
4288         llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType),
4289         ComputeScalableMaxVF);
4290     MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
4291 
4292     // Collect all viable vectorization factors larger than the default MaxVF
4293     // (i.e. MaxVectorElementCount).
4294     SmallVector<ElementCount, 8> VFs;
4295     for (ElementCount VS = MaxVectorElementCount * 2;
4296          ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
4297       VFs.push_back(VS);
4298 
4299     // For each VF calculate its register usage.
4300     auto RUs = calculateRegisterUsage(VFs);
4301 
4302     // Select the largest VF which doesn't require more registers than existing
4303     // ones.
4304     for (int I = RUs.size() - 1; I >= 0; --I) {
4305       const auto &MLU = RUs[I].MaxLocalUsers;
4306       if (all_of(MLU, [&](decltype(MLU.front()) &LU) {
4307             return LU.second <= TTI.getNumberOfRegisters(LU.first);
4308           })) {
4309         MaxVF = VFs[I];
4310         break;
4311       }
4312     }
4313     if (ElementCount MinVF =
4314             TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
4315       if (ElementCount::isKnownLT(MaxVF, MinVF)) {
4316         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
4317                           << ") with target's minimum: " << MinVF << '\n');
4318         MaxVF = MinVF;
4319       }
4320     }
4321 
4322     // Invalidate any widening decisions we might have made, in case the loop
4323     // requires prediction (decided later), but we have already made some
4324     // load/store widening decisions.
4325     invalidateCostModelingDecisions();
4326   }
4327   return MaxVF;
4328 }
4329 
4330 /// Convenience function that returns the value of vscale_range if
4331 /// vscale_range.min == vscale_range.max, and otherwise returns the value
4332 /// returned by the corresponding TTI method.
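     /// For example (illustrative), vscale_range(2,2) yields 2, whereas
     /// vscale_range(1,16) defers to TTI.getVScaleForTuning().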
4333 static std::optional<unsigned>
4334 getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI) {
4335   const Function *Fn = L->getHeader()->getParent();
4336   if (Fn->hasFnAttribute(Attribute::VScaleRange)) {
4337     auto Attr = Fn->getFnAttribute(Attribute::VScaleRange);
4338     auto Min = Attr.getVScaleRangeMin();
4339     auto Max = Attr.getVScaleRangeMax();
4340     if (Max && Min == Max)
4341       return Max;
4342   }
4343 
4344   return TTI.getVScaleForTuning();
4345 }
4346 
4347 /// This function attempts to return a value that represents the vectorization
4348 /// factor at runtime. For fixed-width VFs we know this precisely at compile
4349 /// time, but for scalable VFs we calculate it based on an estimate of the
4350 /// vscale value.
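     /// For example (illustrative), VF = vscale x 4 with an estimated vscale
     /// of 2 yields 8, while a fixed VF of 4 simply yields 4.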
4351 static unsigned getEstimatedRuntimeVF(const Loop *L,
4352                                       const TargetTransformInfo &TTI,
4353                                       ElementCount VF) {
4354   unsigned EstimatedVF = VF.getKnownMinValue();
4355   if (VF.isScalable())
4356     if (std::optional<unsigned> VScale = getVScaleForTuning(L, TTI))
4357       EstimatedVF *= *VScale;
4358   assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
4359   return EstimatedVF;
4360 }
4361 
4362 bool LoopVectorizationPlanner::isMoreProfitable(
4363     const VectorizationFactor &A, const VectorizationFactor &B,
4364     const unsigned MaxTripCount) const {
4365   InstructionCost CostA = A.Cost;
4366   InstructionCost CostB = B.Cost;
4367 
4368   // Improve estimate for the vector width if it is scalable.
4369   unsigned EstimatedWidthA = A.Width.getKnownMinValue();
4370   unsigned EstimatedWidthB = B.Width.getKnownMinValue();
4371   if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI)) {
4372     if (A.Width.isScalable())
4373       EstimatedWidthA *= *VScale;
4374     if (B.Width.isScalable())
4375       EstimatedWidthB *= *VScale;
4376   }
4377 
4378   // Assume vscale may be larger than 1 (or the value being tuned for),
4379   // so that scalable vectorization is slightly favorable over fixed-width
4380   // vectorization.
4381   bool PreferScalable = !TTI.preferFixedOverScalableIfEqualCost() &&
4382                         A.Width.isScalable() && !B.Width.isScalable();
4383 
4384   auto CmpFn = [PreferScalable](const InstructionCost &LHS,
4385                                 const InstructionCost &RHS) {
4386     return PreferScalable ? LHS <= RHS : LHS < RHS;
4387   };
4388 
4389   // To avoid the need for FP division:
4390   //      (CostA / EstimatedWidthA) < (CostB / EstimatedWidthB)
4391   // <=>  (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA)
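       // Illustrative example: CostA = 8 at estimated width 4 versus CostB = 12
       // at estimated width 8 compares 8 * 8 = 64 against 12 * 4 = 48, so B has
       // the lower per-lane cost.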
4392   if (!MaxTripCount)
4393     return CmpFn(CostA * EstimatedWidthB, CostB * EstimatedWidthA);
4394 
4395   auto GetCostForTC = [MaxTripCount, this](unsigned VF,
4396                                            InstructionCost VectorCost,
4397                                            InstructionCost ScalarCost) {
4398     // If the trip count is a known (possibly small) constant, the trip count
4399     // will be rounded up to an integer number of iterations under
4400     // FoldTailByMasking. The total cost in that case will be
4401     // VecCost*ceil(TripCount/VF). When not folding the tail, the total
4402     // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be
4403     // some extra overheads, but for the purpose of comparing the costs of
4404     // different VFs we can use this to compare the total loop-body cost
4405     // expected after vectorization.
4406     if (CM.foldTailByMasking())
4407       return VectorCost * divideCeil(MaxTripCount, VF);
4408     return VectorCost * (MaxTripCount / VF) + ScalarCost * (MaxTripCount % VF);
4409   };
4410 
4411   auto RTCostA = GetCostForTC(EstimatedWidthA, CostA, A.ScalarCost);
4412   auto RTCostB = GetCostForTC(EstimatedWidthB, CostB, B.ScalarCost);
4413   return CmpFn(RTCostA, RTCostB);
4414 }
4415 
4416 bool LoopVectorizationPlanner::isMoreProfitable(
4417     const VectorizationFactor &A, const VectorizationFactor &B) const {
4418   const unsigned MaxTripCount = PSE.getSmallConstantMaxTripCount();
4419   return LoopVectorizationPlanner::isMoreProfitable(A, B, MaxTripCount);
4420 }
4421 
4422 void LoopVectorizationPlanner::emitInvalidCostRemarks(
4423     OptimizationRemarkEmitter *ORE) {
4424   using RecipeVFPair = std::pair<VPRecipeBase *, ElementCount>;
4425   SmallVector<RecipeVFPair> InvalidCosts;
4426   for (const auto &Plan : VPlans) {
4427     for (ElementCount VF : Plan->vectorFactors()) {
4428       VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(),
4429                             CM);
4430       precomputeCosts(*Plan, VF, CostCtx);
4431       auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry());
4432       for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
4433         for (auto &R : *VPBB) {
4434           if (!R.cost(VF, CostCtx).isValid())
4435             InvalidCosts.emplace_back(&R, VF);
4436         }
4437       }
4438     }
4439   }
4440   if (InvalidCosts.empty())
4441     return;
4442 
4443   // Emit a report of VFs with invalid costs in the loop.
4444 
4445   // Group the remarks per recipe, keeping the recipe order from InvalidCosts.
4446   DenseMap<VPRecipeBase *, unsigned> Numbering;
4447   unsigned I = 0;
4448   for (auto &Pair : InvalidCosts)
4449     if (!Numbering.count(Pair.first))
4450       Numbering[Pair.first] = I++;
4451 
4452   // Sort the list, first on recipe(number) then on VF.
4453   sort(InvalidCosts, [&Numbering](RecipeVFPair &A, RecipeVFPair &B) {
4454     if (Numbering[A.first] != Numbering[B.first])
4455       return Numbering[A.first] < Numbering[B.first];
4456     const auto &LHS = A.second;
4457     const auto &RHS = B.second;
4458     return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
4459            std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
4460   });
4461 
4462   // For a list of ordered recipe-VF pairs:
4463   //   [(load, VF1), (load, VF2), (store, VF1)]
4464   // group the recipes together to emit separate remarks for:
4465   //   load  (VF1, VF2)
4466   //   store (VF1)
4467   auto Tail = ArrayRef<RecipeVFPair>(InvalidCosts);
4468   auto Subset = ArrayRef<RecipeVFPair>();
4469   do {
4470     if (Subset.empty())
4471       Subset = Tail.take_front(1);
4472 
4473     VPRecipeBase *R = Subset.front().first;
4474 
4475     unsigned Opcode =
4476         TypeSwitch<const VPRecipeBase *, unsigned>(R)
4477             .Case<VPHeaderPHIRecipe>(
4478                 [](const auto *R) { return Instruction::PHI; })
4479             .Case<VPWidenSelectRecipe>(
4480                 [](const auto *R) { return Instruction::Select; })
4481             .Case<VPWidenStoreRecipe>(
4482                 [](const auto *R) { return Instruction::Store; })
4483             .Case<VPWidenLoadRecipe>(
4484                 [](const auto *R) { return Instruction::Load; })
4485             .Case<VPWidenCallRecipe, VPWidenIntrinsicRecipe>(
4486                 [](const auto *R) { return Instruction::Call; })
4487             .Case<VPInstruction, VPWidenRecipe, VPReplicateRecipe,
4488                   VPWidenCastRecipe>(
4489                 [](const auto *R) { return R->getOpcode(); })
4490             .Case<VPInterleaveRecipe>([](const VPInterleaveRecipe *R) {
4491               return R->getStoredValues().empty() ? Instruction::Load
4492                                                   : Instruction::Store;
4493             });
4494 
4495     // If the next recipe is different, or if there are no other pairs,
4496     // emit a remark for the collated subset. e.g.
4497     //   [(load, VF1), (load, VF2)]
4498     // to emit:
4499     //  remark: invalid costs for 'load' at VF=(VF1, VF2)
4500     if (Subset == Tail || Tail[Subset.size()].first != R) {
4501       std::string OutString;
4502       raw_string_ostream OS(OutString);
4503       assert(!Subset.empty() && "Unexpected empty range");
4504       OS << "Recipe with invalid costs prevented vectorization at VF=(";
4505       for (const auto &Pair : Subset)
4506         OS << (Pair.second == Subset.front().second ? "" : ", ") << Pair.second;
4507       OS << "):";
4508       if (Opcode == Instruction::Call) {
4509         StringRef Name = "";
4510         if (auto *Int = dyn_cast<VPWidenIntrinsicRecipe>(R)) {
4511           Name = Int->getIntrinsicName();
4512         } else {
4513           auto *WidenCall = dyn_cast<VPWidenCallRecipe>(R);
4514           Function *CalledFn =
4515               WidenCall ? WidenCall->getCalledScalarFunction()
4516                         : cast<Function>(R->getOperand(R->getNumOperands() - 1)
4517                                              ->getLiveInIRValue());
4518           Name = CalledFn->getName();
4519         }
4520         OS << " call to " << Name;
4521       } else
4522         OS << " " << Instruction::getOpcodeName(Opcode);
4523       reportVectorizationInfo(OutString, "InvalidCost", ORE, OrigLoop, nullptr,
4524                               R->getDebugLoc());
4525       Tail = Tail.drop_front(Subset.size());
4526       Subset = {};
4527     } else
4528       // Grow the subset by one element
4529       Subset = Tail.take_front(Subset.size() + 1);
4530   } while (!Tail.empty());
4531 }
4532 
4533 /// Check if any recipe of \p Plan will generate a vector value, which will be
4534 /// assigned a vector register.
4535 static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
4536                                 const TargetTransformInfo &TTI) {
4537   assert(VF.isVector() && "Checking a scalar VF?");
4538   VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
4539   DenseSet<VPRecipeBase *> EphemeralRecipes;
4540   collectEphemeralRecipesForVPlan(Plan, EphemeralRecipes);
4541   // Set of already visited types.
4542   DenseSet<Type *> Visited;
4543   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
4544            vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
4545     for (VPRecipeBase &R : *VPBB) {
4546       if (EphemeralRecipes.contains(&R))
4547         continue;
4548       // Continue early if the recipe is considered not to produce a vector
4549       // result. Note that this includes VPInstruction, where some opcodes may
4550       // produce a vector; this preserves existing behavior, as VPInstructions
4551       // model aspects not directly mapped to existing IR instructions.
4552       switch (R.getVPDefID()) {
4553       case VPDef::VPDerivedIVSC:
4554       case VPDef::VPScalarIVStepsSC:
4555       case VPDef::VPScalarCastSC:
4556       case VPDef::VPReplicateSC:
4557       case VPDef::VPInstructionSC:
4558       case VPDef::VPCanonicalIVPHISC:
4559       case VPDef::VPVectorPointerSC:
4560       case VPDef::VPReverseVectorPointerSC:
4561       case VPDef::VPExpandSCEVSC:
4562       case VPDef::VPEVLBasedIVPHISC:
4563       case VPDef::VPPredInstPHISC:
4564       case VPDef::VPBranchOnMaskSC:
4565         continue;
4566       case VPDef::VPReductionSC:
4567       case VPDef::VPActiveLaneMaskPHISC:
4568       case VPDef::VPWidenCallSC:
4569       case VPDef::VPWidenCanonicalIVSC:
4570       case VPDef::VPWidenCastSC:
4571       case VPDef::VPWidenGEPSC:
4572       case VPDef::VPWidenIntrinsicSC:
4573       case VPDef::VPWidenSC:
4574       case VPDef::VPWidenSelectSC:
4575       case VPDef::VPBlendSC:
4576       case VPDef::VPFirstOrderRecurrencePHISC:
4577       case VPDef::VPWidenPHISC:
4578       case VPDef::VPWidenIntOrFpInductionSC:
4579       case VPDef::VPWidenPointerInductionSC:
4580       case VPDef::VPReductionPHISC:
4581       case VPDef::VPInterleaveSC:
4582       case VPDef::VPWidenLoadEVLSC:
4583       case VPDef::VPWidenLoadSC:
4584       case VPDef::VPWidenStoreEVLSC:
4585       case VPDef::VPWidenStoreSC:
4586         break;
4587       default:
4588         llvm_unreachable("unhandled recipe");
4589       }
4590 
4591       auto WillWiden = [&TTI, VF](Type *ScalarTy) {
4592         Type *VectorTy = toVectorTy(ScalarTy, VF);
4593         unsigned NumLegalParts = TTI.getNumberOfParts(VectorTy);
4594         if (!NumLegalParts)
4595           return false;
4596         if (VF.isScalable()) {
4597           // <vscale x 1 x iN> is assumed to be profitable over iN because
4598           // scalable registers are a distinct register class from scalar
4599           // ones. If we ever find a target which wants to lower scalable
4600           // vectors back to scalars, we'll need to update this code to
4601           // explicitly ask TTI about the register class uses for each part.
4602           return NumLegalParts <= VF.getKnownMinValue();
4603         }
4604         // Two or more elements sharing a register count as vectorized.
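             // For example (illustrative), with VF = 4 and 128-bit registers a
             // <4 x i32> needs a single part (1 < 4) and counts as widened; if
             // every element ended up in its own part (NumLegalParts == VF),
             // the value would effectively be scalarized and would not count.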
4605         return NumLegalParts < VF.getKnownMinValue();
4606       };
4607 
4608       // If no def and not a store (e.g. a branch), continue - no value to check.
4609       if (R.getNumDefinedValues() == 0 &&
4610           !isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe, VPInterleaveRecipe>(
4611               &R))
4612         continue;
4613       // For multi-def recipes (currently only interleaved loads), it suffices
4614       // to check the first def only.
4615       // For stores, check their stored value; for interleaved stores it
4616       // suffices to check the first stored value only. In all cases this is
4617       // the second operand.
4618       VPValue *ToCheck =
4619           R.getNumDefinedValues() >= 1 ? R.getVPValue(0) : R.getOperand(1);
4620       Type *ScalarTy = TypeInfo.inferScalarType(ToCheck);
4621       if (!Visited.insert({ScalarTy}).second)
4622         continue;
4623       if (WillWiden(ScalarTy))
4624         return true;
4625     }
4626   }
4627 
4628   return false;
4629 }
4630 
4631 #ifndef NDEBUG
4632 VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
4633   InstructionCost ExpectedCost = CM.expectedCost(ElementCount::getFixed(1));
4634   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
4635   assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
4636   assert(any_of(VPlans,
4637                 [](std::unique_ptr<VPlan> &P) {
4638                   return P->hasVF(ElementCount::getFixed(1));
4639                 }) &&
4640          "Expected Scalar VF to be a candidate");
4641 
4642   const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost,
4643                                        ExpectedCost);
4644   VectorizationFactor ChosenFactor = ScalarCost;
4645 
4646   bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
4647   if (ForceVectorization &&
4648       (VPlans.size() > 1 || !VPlans[0]->hasScalarVFOnly())) {
4649     // Ignore scalar width, because the user explicitly wants vectorization.
4650     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
4651     // evaluation.
4652     ChosenFactor.Cost = InstructionCost::getMax();
4653   }
4654 
4655   for (auto &P : VPlans) {
4656     for (ElementCount VF : P->vectorFactors()) {
4657       // The cost for scalar VF=1 is already calculated, so ignore it.
4658       if (VF.isScalar())
4659         continue;
4660 
4661       InstructionCost C = CM.expectedCost(VF);
4662       VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost);
4663 
4664       unsigned Width = getEstimatedRuntimeVF(OrigLoop, TTI, Candidate.Width);
4665       LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << VF
4666                         << " costs: " << (Candidate.Cost / Width));
4667       if (VF.isScalable())
4668         LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
4669                           << getVScaleForTuning(OrigLoop, TTI).value_or(1)
4670                           << ")");
4671       LLVM_DEBUG(dbgs() << ".\n");
4672 
4673       if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
4674         LLVM_DEBUG(
4675             dbgs()
4676             << "LV: Not considering vector loop of width " << VF
4677             << " because it will not generate any vector instructions.\n");
4678         continue;
4679       }
4680 
4681       if (isMoreProfitable(Candidate, ChosenFactor))
4682         ChosenFactor = Candidate;
4683     }
4684   }
4685 
4686   if (!EnableCondStoresVectorization && CM.hasPredStores()) {
4687     reportVectorizationFailure(
4688         "There are conditional stores.",
4689         "store that is conditionally executed prevents vectorization",
4690         "ConditionalStore", ORE, OrigLoop);
4691     ChosenFactor = ScalarCost;
4692   }
4693 
4694   LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
4695                  !isMoreProfitable(ChosenFactor, ScalarCost)) dbgs()
4696              << "LV: Vectorization seems to be not beneficial, "
4697              << "but was forced by a user.\n");
4698   return ChosenFactor;
4699 }
4700 #endif
4701 
4702 bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
4703     ElementCount VF) const {
4704   // Cross-iteration phis such as reductions need special handling and are
4705   // currently unsupported.
4706   if (any_of(OrigLoop->getHeader()->phis(),
4707              [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(&Phi); }))
4708     return false;
4709 
4710   // Phis with uses outside of the loop require special handling and are
4711   // currently unsupported.
4712   for (const auto &Entry : Legal->getInductionVars()) {
4713     // Look for uses of the value of the induction at the last iteration.
4714     Value *PostInc =
4715         Entry.first->getIncomingValueForBlock(OrigLoop->getLoopLatch());
4716     for (User *U : PostInc->users())
4717       if (!OrigLoop->contains(cast<Instruction>(U)))
4718         return false;
4719     // Look for uses of the penultimate value of the induction.
4720     for (User *U : Entry.first->users())
4721       if (!OrigLoop->contains(cast<Instruction>(U)))
4722         return false;
4723   }
4724 
4725   // Epilogue vectorization code has not been audited to ensure it handles
4726   // non-latch exits properly.  It may be fine, but it needs to be audited and
4727   // tested.
4728   // TODO: Add support for loops with an early exit.
4729   if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch())
4730     return false;
4731 
4732   return true;
4733 }
4734 
4735 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
4736     const ElementCount VF, const unsigned IC) const {
4737   // FIXME: We need a much better cost-model to take different parameters such
4738   // as register pressure, code size increase and cost of extra branches into
4739   // account. For now we apply a very crude heuristic and only consider loops
4740   // with vectorization factors larger than a certain value.
4741 
4742   // Allow the target to opt out entirely.
4743   if (!TTI.preferEpilogueVectorization())
4744     return false;
4745 
4746   // We also consider epilogue vectorization unprofitable for targets that don't
4747   // consider interleaving beneficial (e.g. MVE).
4748   if (TTI.getMaxInterleaveFactor(VF) <= 1)
4749     return false;
4750 
4751   // TODO: PR #108190 introduced a discrepancy between fixed-width and scalable
4752   // VFs when deciding profitability.
4753   // See related "TODO: extend to support scalable VFs." in
4754   // selectEpilogueVectorizationFactor.
4755   unsigned Multiplier = VF.isFixed() ? IC : 1;
4756   unsigned MinVFThreshold = EpilogueVectorizationMinVF.getNumOccurrences() > 0
4757                                 ? EpilogueVectorizationMinVF
4758                                 : TTI.getEpilogueVectorizationMinVF();
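       // Illustrative example: with a minimum-VF threshold of 16, a fixed main
       // loop VF of 8 at IC = 2 reaches 8 * 2 = 16 and qualifies, whereas
       // VF = vscale x 4 with an estimated vscale of 2 reaches only 8
       // (Multiplier is 1 for scalable VFs) and does not.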
4759   return getEstimatedRuntimeVF(TheLoop, TTI, VF * Multiplier) >= MinVFThreshold;
4760 }
4761 
4762 VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
4763     const ElementCount MainLoopVF, unsigned IC) {
4764   VectorizationFactor Result = VectorizationFactor::Disabled();
4765   if (!EnableEpilogueVectorization) {
4766     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n");
4767     return Result;
4768   }
4769 
4770   if (!CM.isScalarEpilogueAllowed()) {
4771     LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no "
4772                          "epilogue is allowed.\n");
4773     return Result;
4774   }
4775 
4776   // Not really a cost consideration, but check for unsupported cases here to
4777   // simplify the logic.
4778   if (!isCandidateForEpilogueVectorization(MainLoopVF)) {
4779     LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop "
4780                          "is not a supported candidate.\n");
4781     return Result;
4782   }
4783 
4784   if (EpilogueVectorizationForceVF > 1) {
4785     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n");
4786     ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF);
4787     if (hasPlanWithVF(ForcedEC))
4788       return {ForcedEC, 0, 0};
4789 
4790     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not "
4791                          "viable.\n");
4792     return Result;
4793   }
4794 
4795   if (OrigLoop->getHeader()->getParent()->hasOptSize() ||
4796       OrigLoop->getHeader()->getParent()->hasMinSize()) {
4797     LLVM_DEBUG(
4798         dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n");
4799     return Result;
4800   }
4801 
4802   if (!CM.isEpilogueVectorizationProfitable(MainLoopVF, IC)) {
4803     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
4804                          "this loop\n");
4805     return Result;
4806   }
4807 
4808   // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
4809   // the main loop handles 8 lanes per iteration. We could still benefit from
4810   // vectorizing the epilogue loop with VF=4.
4811   ElementCount EstimatedRuntimeVF =
4812       ElementCount::getFixed(getEstimatedRuntimeVF(OrigLoop, TTI, MainLoopVF));
4813 
4814   ScalarEvolution &SE = *PSE.getSE();
4815   Type *TCType = Legal->getWidestInductionType();
4816   const SCEV *RemainingIterations = nullptr;
4817   unsigned MaxTripCount = 0;
4818   for (auto &NextVF : ProfitableVFs) {
4819     // Skip candidate VFs without a corresponding VPlan.
4820     if (!hasPlanWithVF(NextVF.Width))
4821       continue;
4822 
4823     // Skip candidate VFs with widths >= the (estimated) runtime VF (scalable
4824     // vectors) or > the VF of the main loop (fixed vectors).
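         // For illustration (assumed numbers): with MainLoopVF = vscale x 4 and an
         // estimated vscale of 2 (runtime VF 8), fixed candidates of width 8 or
         // more and scalable candidates of vscale x 4 or more are skipped, while a
         // fixed candidate of width 4 is kept.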
4825     if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
4826          ElementCount::isKnownGE(NextVF.Width, EstimatedRuntimeVF)) ||
4827         (NextVF.Width.isScalable() &&
4828          ElementCount::isKnownGE(NextVF.Width, MainLoopVF)) ||
4829         (!NextVF.Width.isScalable() && !MainLoopVF.isScalable() &&
4830          ElementCount::isKnownGT(NextVF.Width, MainLoopVF)))
4831       continue;
4832 
4833     // If NextVF is greater than the number of remaining iterations, the
4834     // epilogue loop would be dead. Skip such factors.
4835     if (!MainLoopVF.isScalable() && !NextVF.Width.isScalable()) {
4836       // TODO: extend to support scalable VFs.
4837       if (!RemainingIterations) {
4838         const SCEV *TC = vputils::getSCEVExprForVPValue(
4839             getPlanFor(NextVF.Width).getTripCount(), SE);
4840         assert(!isa<SCEVCouldNotCompute>(TC) &&
4841                "Trip count SCEV must be computable");
4842         RemainingIterations = SE.getURemExpr(
4843             TC, SE.getConstant(TCType, MainLoopVF.getKnownMinValue() * IC));
4844         MaxTripCount = MainLoopVF.getKnownMinValue() * IC - 1;
4845         if (SE.isKnownPredicate(CmpInst::ICMP_ULT, RemainingIterations,
4846                                 SE.getConstant(TCType, MaxTripCount))) {
4847           MaxTripCount =
4848               SE.getUnsignedRangeMax(RemainingIterations).getZExtValue();
4849         }
4850         LLVM_DEBUG(dbgs() << "LEV: Maximum Trip Count for Epilogue: "
4851                           << MaxTripCount << "\n");
4852       }
4853       if (SE.isKnownPredicate(
4854               CmpInst::ICMP_UGT,
4855               SE.getConstant(TCType, NextVF.Width.getKnownMinValue()),
4856               RemainingIterations))
4857         continue;
4858     }
4859 
4860     if (Result.Width.isScalar() ||
4861         isMoreProfitable(NextVF, Result, MaxTripCount))
4862       Result = NextVF;
4863   }
4864 
4865   if (Result != VectorizationFactor::Disabled())
4866     LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
4867                       << Result.Width << "\n");
4868   return Result;
4869 }
4870 
4871 std::pair<unsigned, unsigned>
4872 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
4873   unsigned MinWidth = -1U;
4874   unsigned MaxWidth = 8;
4875   const DataLayout &DL = TheFunction->getDataLayout();
4876   // For in-loop reductions, no element types are added to ElementTypesInLoop
4877   // if there are no loads/stores in the loop. In this case, check through the
4878   // reduction variables to determine the maximum width.
4879   if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
4880     // Reset MaxWidth so that we can find the smallest type used by recurrences
4881     // in the loop.
4882     MaxWidth = -1U;
4883     for (const auto &PhiDescriptorPair : Legal->getReductionVars()) {
4884       const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
4885       // When finding the min width used by the recurrence we need to account
4886       // for casts on the input operands of the recurrence.
4887       MaxWidth = std::min<unsigned>(
4888           MaxWidth, std::min<unsigned>(
4889                         RdxDesc.getMinWidthCastToRecurrenceTypeInBits(),
4890                         RdxDesc.getRecurrenceType()->getScalarSizeInBits()));
4891     }
4892   } else {
4893     for (Type *T : ElementTypesInLoop) {
4894       MinWidth = std::min<unsigned>(
4895           MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
4896       MaxWidth = std::max<unsigned>(
4897           MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
4898     }
4899   }
4900   return {MinWidth, MaxWidth};
4901 }
4902 
4903 void LoopVectorizationCostModel::collectElementTypesForWidening() {
4904   ElementTypesInLoop.clear();
4905   // For each block.
4906   for (BasicBlock *BB : TheLoop->blocks()) {
4907     // For each instruction in the loop.
4908     for (Instruction &I : BB->instructionsWithoutDebug()) {
4909       Type *T = I.getType();
4910 
4911       // Skip ignored values.
4912       if (ValuesToIgnore.count(&I))
4913         continue;
4914 
4915       // Only examine Loads, Stores and PHINodes.
4916       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
4917         continue;
4918 
4919       // Examine PHI nodes that are reduction variables. Update the type to
4920       // account for the recurrence type.
4921       if (auto *PN = dyn_cast<PHINode>(&I)) {
4922         if (!Legal->isReductionVariable(PN))
4923           continue;
4924         const RecurrenceDescriptor &RdxDesc =
4925             Legal->getReductionVars().find(PN)->second;
4926         if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
4927             TTI.preferInLoopReduction(RdxDesc.getOpcode(),
4928                                       RdxDesc.getRecurrenceType(),
4929                                       TargetTransformInfo::ReductionFlags()))
4930           continue;
4931         T = RdxDesc.getRecurrenceType();
4932       }
4933 
4934       // Examine the stored values.
4935       if (auto *ST = dyn_cast<StoreInst>(&I))
4936         T = ST->getValueOperand()->getType();
4937 
4938       assert(T->isSized() &&
4939              "Expected the load/store/recurrence type to be sized");
4940 
4941       ElementTypesInLoop.insert(T);
4942     }
4943   }
4944 }
4945 
4946 unsigned
4947 LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
4948                                                   InstructionCost LoopCost) {
4949   // -- The interleave heuristics --
4950   // We interleave the loop in order to expose ILP and reduce the loop overhead.
4951   // There are many micro-architectural considerations that we can't predict
4952   // at this level. For example, frontend pressure (on decode or fetch) due to
4953   // code size, or the number and capabilities of the execution ports.
4954   //
4955   // We use the following heuristics to select the interleave count:
4956   // 1. If the code has reductions, then we interleave to break the cross
4957   // iteration dependency.
4958   // 2. If the loop is really small, then we interleave to reduce the loop
4959   // overhead.
4960   // 3. We don't interleave if we think that we will spill registers to memory
4961   // due to the increased register pressure.
4962 
4963   if (!isScalarEpilogueAllowed())
4964     return 1;
4965 
4966   // Do not interleave if EVL is preferred and no User IC is specified.
4967   if (foldTailWithEVL()) {
4968     LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. "
4969                          "Unroll factor forced to be 1.\n");
4970     return 1;
4971   }
4972 
4973   // A max safe dependence distance already limits the VF, so do not interleave.
4974   if (!Legal->isSafeForAnyVectorWidth())
4975     return 1;
4976 
4977   // We don't attempt to perform interleaving for loops with uncountable early
4978   // exits because the VPInstruction::AnyOf code cannot currently handle
4979   // multiple parts.
4980   if (Legal->hasUncountableEarlyExit())
4981     return 1;
4982 
4983   auto BestKnownTC = getSmallBestKnownTC(PSE, TheLoop);
4984   const bool HasReductions = !Legal->getReductionVars().empty();
4985 
4986   // If we did not calculate the cost for VF (because the user selected the VF)
4987   // then we calculate the cost of VF here.
4988   if (LoopCost == 0) {
4989     LoopCost = expectedCost(VF);
4990     assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost");
4991 
4992     // Loop body is free and there is no need for interleaving.
4993     if (LoopCost == 0)
4994       return 1;
4995   }
4996 
4997   RegisterUsage R = calculateRegisterUsage({VF})[0];
4998   // We divide by these values below, so make sure each register class has at
4999   // least one local user to avoid dividing by zero.
5000   for (auto &Pair : R.MaxLocalUsers) {
5001     Pair.second = std::max(Pair.second, 1U);
5002   }
5003 
5004   // We calculate the interleave count using the following formula.
5005   // Subtract the number of loop invariants from the number of available
5006   // registers. These registers are used by all of the interleaved instances.
5007   // Next, divide the remaining registers by the number of registers that is
5008   // required by the loop, in order to estimate how many parallel instances
5009   // fit without causing spills. All of this is rounded down if necessary to be
5010   // a power of two. We want power of two interleave count to simplify any
5011   // addressing operations or alignment considerations.
5012   // We also want power of two interleave counts to ensure that the induction
5013   // variable of the vector loop wraps to zero, when tail is folded by masking;
5014   // this currently happens when optimizing for size, in which case IC is set to 1 above.
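       // Worked example with assumed numbers: for a register class with 32
       // registers, 4 of them holding loop-invariant values and at most 7 values
       // live at once, the class allows bit_floor((32 - 4) / 7) = 4 interleaved
       // copies; the induction-variable adjustment below likewise gives
       // bit_floor((32 - 4 - 1) / 6) = 4.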
5015   unsigned IC = UINT_MAX;
5016 
5017   for (const auto &Pair : R.MaxLocalUsers) {
5018     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(Pair.first);
5019     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5020                       << " registers of "
5021                       << TTI.getRegisterClassName(Pair.first)
5022                       << " register class\n");
5023     if (VF.isScalar()) {
5024       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5025         TargetNumRegisters = ForceTargetNumScalarRegs;
5026     } else {
5027       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5028         TargetNumRegisters = ForceTargetNumVectorRegs;
5029     }
5030     unsigned MaxLocalUsers = Pair.second;
5031     unsigned LoopInvariantRegs = 0;
5032     if (R.LoopInvariantRegs.find(Pair.first) != R.LoopInvariantRegs.end())
5033       LoopInvariantRegs = R.LoopInvariantRegs[Pair.first];
5034 
5035     unsigned TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs) /
5036                                      MaxLocalUsers);
5037     // Don't count the induction variable as interleaved.
5038     if (EnableIndVarRegisterHeur) {
5039       TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5040                               std::max(1U, (MaxLocalUsers - 1)));
5041     }
5042 
5043     IC = std::min(IC, TmpIC);
5044   }
5045 
5046   // Clamp the interleave ranges to reasonable counts.
5047   unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
5048 
5049   // Check if the user has overridden the max.
5050   if (VF.isScalar()) {
5051     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5052       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5053   } else {
5054     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5055       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5056   }
5057 
5058   unsigned EstimatedVF = getEstimatedRuntimeVF(TheLoop, TTI, VF);
5059   unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5060   if (KnownTC > 0) {
5061     // At least one iteration must be scalar when this constraint holds. So the
5062     // maximum available iterations for interleaving is one less.
5063     unsigned AvailableTC =
5064         requiresScalarEpilogue(VF.isVector()) ? KnownTC - 1 : KnownTC;
5065 
5066     // If trip count is known we select between two prospective ICs, where
5067     // 1) the aggressive IC is capped by the trip count divided by VF
5068     // 2) the conservative IC is capped by the trip count divided by (VF * 2)
5069     // The final IC is selected in a way that the epilogue loop trip count is
5070     // minimized while maximizing the IC itself, so that we either run the
5071     // vector loop at least once if it generates a small epilogue loop, or else
5072     // we run the vector loop at least twice.
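         // For example (assumed values): AvailableTC = 39, EstimatedVF = 8 and a
         // target maximum of 4 give InterleaveCountUB = bit_floor(min(39/8, 4)) = 4
         // and InterleaveCountLB = bit_floor(min(39/16, 4)) = 2; both leave a
         // scalar tail of 7 iterations (39 % 32 == 39 % 16 == 7), so the larger
         // count of 4 is used as the limit.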
5073 
5074     unsigned InterleaveCountUB = bit_floor(
5075         std::max(1u, std::min(AvailableTC / EstimatedVF, MaxInterleaveCount)));
5076     unsigned InterleaveCountLB = bit_floor(std::max(
5077         1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
5078     MaxInterleaveCount = InterleaveCountLB;
5079 
5080     if (InterleaveCountUB != InterleaveCountLB) {
5081       unsigned TailTripCountUB =
5082           (AvailableTC % (EstimatedVF * InterleaveCountUB));
5083       unsigned TailTripCountLB =
5084           (AvailableTC % (EstimatedVF * InterleaveCountLB));
5085       // If both produce the same scalar tail, maximize the IC to do the same
5086       // work in fewer vector loop iterations.
5087       if (TailTripCountUB == TailTripCountLB)
5088         MaxInterleaveCount = InterleaveCountUB;
5089     }
5090   } else if (BestKnownTC && *BestKnownTC > 0) {
5091     // At least one iteration must be scalar when this constraint holds. So the
5092     // maximum available iterations for interleaving is one less.
5093     unsigned AvailableTC = requiresScalarEpilogue(VF.isVector())
5094                                ? (*BestKnownTC) - 1
5095                                : *BestKnownTC;
5096 
5097     // If the trip count is only an estimated compile-time constant, cap the IC
5098     // by the trip count divided by VF * 2, so that the vector loop runs at
5099     // least twice. This makes interleaving appear profitable when there is an
5100     // epilogue loop present. Since the exact trip count is not known, we
5101     // choose to be conservative in our IC estimate.
5102     MaxInterleaveCount = bit_floor(std::max(
5103         1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
5104   }
5105 
5106   assert(MaxInterleaveCount > 0 &&
5107          "Maximum interleave count must be greater than 0");
5108 
5109   // Clamp the calculated IC to be between 1 and the maximum interleave count
5110   // that the target and trip count allow.
5111   if (IC > MaxInterleaveCount)
5112     IC = MaxInterleaveCount;
5113   else
5114     // Make sure IC is greater than 0.
5115     IC = std::max(1u, IC);
5116 
5117   assert(IC > 0 && "Interleave count must be greater than 0.");
5118 
5119   // Interleave if we vectorized this loop and there is a reduction that could
5120   // benefit from interleaving.
5121   if (VF.isVector() && HasReductions) {
5122     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5123     return IC;
5124   }
5125 
5126   // For any scalar loop that either requires runtime checks or predication we
5127   // are better off leaving this to the unroller. Note that if we've already
5128   // vectorized the loop we will have done the runtime check and so interleaving
5129   // won't require further checks.
5130   bool ScalarInterleavingRequiresPredication =
5131       (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) {
5132          return Legal->blockNeedsPredication(BB);
5133        }));
5134   bool ScalarInterleavingRequiresRuntimePointerCheck =
5135       (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
5136 
5137   // We want to interleave small loops in order to reduce the loop overhead and
5138   // potentially expose ILP opportunities.
5139   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
5140                     << "LV: IC is " << IC << '\n'
5141                     << "LV: VF is " << VF << '\n');
5142   const bool AggressivelyInterleaveReductions =
5143       TTI.enableAggressiveInterleaving(HasReductions);
5144   if (!ScalarInterleavingRequiresRuntimePointerCheck &&
5145       !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
5146     // We assume that the loop overhead costs about 1 and use the cost model's
5147     // estimate of the loop body, interleaving until that overhead is roughly 5%
5148     // of the cost of the loop.
5149     unsigned SmallIC = std::min(IC, (unsigned)llvm::bit_floor<uint64_t>(
5150                                         SmallLoopCost / *LoopCost.getValue()));
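         // E.g., assuming a SmallLoopCost of 20 (its usual default) and a loop body
         // cost of 6, this yields SmallIC = min(IC, bit_floor(20 / 6)) = min(IC, 2).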
5151 
5152     // Interleave until store/load ports (estimated by max interleave count) are
5153     // saturated.
5154     unsigned NumStores = Legal->getNumStores();
5155     unsigned NumLoads = Legal->getNumLoads();
5156     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5157     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5158 
5159     // There is little point in interleaving for reductions containing selects
5160     // and compares when VF=1 since it may just create more overhead than it's
5161     // worth for loops with small trip counts. This is because we still have to
5162     // do the final reduction after the loop.
5163     bool HasSelectCmpReductions =
5164         HasReductions &&
5165         any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5166           const RecurrenceDescriptor &RdxDesc = Reduction.second;
5167           RecurKind RK = RdxDesc.getRecurrenceKind();
5168           return RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) ||
5169                  RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK);
5170         });
5171     if (HasSelectCmpReductions) {
5172       LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
5173       return 1;
5174     }
5175 
5176     // If we have a scalar reduction (vector reductions are already dealt with
5177     // by this point), we can increase the critical path length if the loop
5178     // we're interleaving is inside another loop. For tree-wise reductions
5179     // set the limit to 2, and for ordered reductions it's best to disable
5180     // interleaving entirely.
5181     if (HasReductions && TheLoop->getLoopDepth() > 1) {
5182       bool HasOrderedReductions =
5183           any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5184             const RecurrenceDescriptor &RdxDesc = Reduction.second;
5185             return RdxDesc.isOrdered();
5186           });
5187       if (HasOrderedReductions) {
5188         LLVM_DEBUG(
5189             dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
5190         return 1;
5191       }
5192 
5193       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5194       SmallIC = std::min(SmallIC, F);
5195       StoresIC = std::min(StoresIC, F);
5196       LoadsIC = std::min(LoadsIC, F);
5197     }
5198 
5199     if (EnableLoadStoreRuntimeInterleave &&
5200         std::max(StoresIC, LoadsIC) > SmallIC) {
5201       LLVM_DEBUG(
5202           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5203       return std::max(StoresIC, LoadsIC);
5204     }
5205 
5206     // If there are scalar reductions and TTI has enabled aggressive
5207     // interleaving for reductions, we will interleave to expose ILP.
5208     if (VF.isScalar() && AggressivelyInterleaveReductions) {
5209       LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5210       // Interleave no less than SmallIC, but not as aggressively as the normal
5211       // IC, to accommodate the rare situation where resources are too limited.
5212       return std::max(IC / 2, SmallIC);
5213     }
5214 
5215     LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5216     return SmallIC;
5217   }
5218 
5219   // Interleave if this is a large loop (small loops are already dealt with by
5220   // this point) that could benefit from interleaving.
5221   if (AggressivelyInterleaveReductions) {
5222     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5223     return IC;
5224   }
5225 
5226   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5227   return 1;
5228 }
5229 
5230 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5231 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
5232   // This function calculates the register usage by measuring the highest number
5233   // of values that are alive at a single location. Obviously, this is a very
5234   // rough estimation. We scan the loop in topological order and assign a
5235   // number to each instruction. We use RPO to ensure that defs are
5236   // met before their users. We assume that each instruction that has in-loop
5237   // users starts an interval. We record every time that an in-loop value is
5238   // used, so we have a list of the first and last occurrences of each
5239   // instruction. Next, we transpose this data structure into a multi map that
5240   // holds the list of intervals that *end* at a specific location. This multi
5241   // map allows us to perform a linear search. We scan the instructions linearly
5242   // and record each time that a new interval starts, by placing it in a set.
5243   // If we find this value in the multi-map then we remove it from the set.
5244   // The max register usage is the maximum size of the set.
5245   // We also search for instructions that are defined outside the loop, but are
5246   // used inside the loop. We need this number separately from the max-interval
5247   // usage number because when we unroll, loop-invariant values do not
5248   // require more registers.
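       // Illustrative example (hypothetical IR): for a body '%a = load; %b = add
       // %a, 1; %c = mul %a, %b; store %c', %a stays open until its last use in
       // %c, so when %c is visited both %a and %b are open and a usage of two
       // values (times the registers each needs for the given VF) is recorded.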
5249   LoopBlocksDFS DFS(TheLoop);
5250   DFS.perform(LI);
5251 
5252   RegisterUsage RU;
5253 
5254   // Each 'key' in the map opens a new interval. The values
5255   // of the map are the indices of the 'last seen' usage of the
5256   // instruction that is the key.
5257   using IntervalMap = SmallDenseMap<Instruction *, unsigned, 16>;
5258 
5259   // Maps instruction to its index.
5260   SmallVector<Instruction *, 64> IdxToInstr;
5261   // Marks the end of each interval.
5262   IntervalMap EndPoint;
5263   // Saves the set of instructions that are used inside the loop.
5264   SmallPtrSet<Instruction *, 8> Ends;
5265   // Saves the list of values that are used in the loop but are defined outside
5266   // the loop (not including non-instruction values such as arguments and
5267   // constants).
5268   SmallSetVector<Instruction *, 8> LoopInvariants;
5269 
5270   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5271     for (Instruction &I : BB->instructionsWithoutDebug()) {
5272       IdxToInstr.push_back(&I);
5273 
5274       // Save the end location of each USE.
5275       for (Value *U : I.operands()) {
5276         auto *Instr = dyn_cast<Instruction>(U);
5277 
5278         // Ignore non-instruction values such as arguments, constants, etc.
5279         // FIXME: Might need some motivation why these values are ignored. If
5280         // for example, an argument is used inside the loop, it will increase the
5281         // register pressure (so shouldn't we add it to LoopInvariants?).
5282         if (!Instr)
5283           continue;
5284 
5285         // If this instruction is outside the loop then record it and continue.
5286         if (!TheLoop->contains(Instr)) {
5287           LoopInvariants.insert(Instr);
5288           continue;
5289         }
5290 
5291         // Overwrite previous end points.
5292         EndPoint[Instr] = IdxToInstr.size();
5293         Ends.insert(Instr);
5294       }
5295     }
5296   }
5297 
5298   // Saves the list of intervals that end with the index in 'key'.
5299   using InstrList = SmallVector<Instruction *, 2>;
5300   SmallDenseMap<unsigned, InstrList, 16> TransposeEnds;
5301 
5302   // Transpose the EndPoints to a list of values that end at each index.
5303   for (auto &Interval : EndPoint)
5304     TransposeEnds[Interval.second].push_back(Interval.first);
5305 
5306   SmallPtrSet<Instruction *, 8> OpenIntervals;
5307   SmallVector<RegisterUsage, 8> RUs(VFs.size());
5308   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
5309 
5310   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5311 
5312   const auto &TTICapture = TTI;
5313   auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
5314     if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty) ||
5315         (VF.isScalable() &&
5316          !TTICapture.isElementTypeLegalForScalableVector(Ty)))
5317       return 0;
5318     return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
5319   };
5320 
5321   for (unsigned int Idx = 0, Sz = IdxToInstr.size(); Idx < Sz; ++Idx) {
5322     Instruction *I = IdxToInstr[Idx];
5323 
5324     // Remove all of the instructions that end at this location.
5325     InstrList &List = TransposeEnds[Idx];
5326     for (Instruction *ToRemove : List)
5327       OpenIntervals.erase(ToRemove);
5328 
5329     // Ignore instructions that are never used within the loop.
5330     if (!Ends.count(I))
5331       continue;
5332 
5333     // Skip ignored values.
5334     if (ValuesToIgnore.count(I))
5335       continue;
5336 
5337     collectInLoopReductions();
5338 
5339     // For each VF find the maximum usage of registers.
5340     for (unsigned J = 0, E = VFs.size(); J < E; ++J) {
5341       // Count the number of registers used, per register class, given all open
5342       // intervals.
5343       // Note that elements in this SmallMapVector will be default constructed
5344       // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if
5345       // there is no previous entry for ClassID.
5346       SmallMapVector<unsigned, unsigned, 4> RegUsage;
5347 
5348       if (VFs[J].isScalar()) {
5349         for (auto *Inst : OpenIntervals) {
5350           unsigned ClassID =
5351               TTI.getRegisterClassForType(false, Inst->getType());
5352           // FIXME: The target might use more than one register for the type
5353           // even in the scalar case.
5354           RegUsage[ClassID] += 1;
5355         }
5356       } else {
5357         collectUniformsAndScalars(VFs[J]);
5358         for (auto *Inst : OpenIntervals) {
5359           // Skip ignored values for VF > 1.
5360           if (VecValuesToIgnore.count(Inst))
5361             continue;
5362           if (isScalarAfterVectorization(Inst, VFs[J])) {
5363             unsigned ClassID =
5364                 TTI.getRegisterClassForType(false, Inst->getType());
5365             // FIXME: The target might use more than one register for the type
5366             // even in the scalar case.
5367             RegUsage[ClassID] += 1;
5368           } else {
5369             unsigned ClassID =
5370                 TTI.getRegisterClassForType(true, Inst->getType());
5371             RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[J]);
5372           }
5373         }
5374       }
5375 
5376       for (const auto &Pair : RegUsage) {
5377         auto &Entry = MaxUsages[J][Pair.first];
5378         Entry = std::max(Entry, Pair.second);
5379       }
5380     }
5381 
5382     LLVM_DEBUG(dbgs() << "LV(REG): At #" << Idx << " Interval # "
5383                       << OpenIntervals.size() << '\n');
5384 
5385     // Add the current instruction to the list of open intervals.
5386     OpenIntervals.insert(I);
5387   }
5388 
5389   for (unsigned Idx = 0, End = VFs.size(); Idx < End; ++Idx) {
5390     // Note that elements in this SmallMapVector will be default constructed
5391     // as 0. So we can use "Invariant[ClassID] += n" in the code below even if
5392     // there is no previous entry for ClassID.
5393     SmallMapVector<unsigned, unsigned, 4> Invariant;
5394 
5395     for (auto *Inst : LoopInvariants) {
5396       // FIXME: The target might use more than one register for the type
5397       // even in the scalar case.
5398       bool IsScalar = all_of(Inst->users(), [&](User *U) {
5399         auto *I = cast<Instruction>(U);
5400         return TheLoop != LI->getLoopFor(I->getParent()) ||
5401                isScalarAfterVectorization(I, VFs[Idx]);
5402       });
5403 
5404       ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[Idx];
5405       unsigned ClassID =
5406           TTI.getRegisterClassForType(VF.isVector(), Inst->getType());
5407       Invariant[ClassID] += GetRegUsage(Inst->getType(), VF);
5408     }
5409 
5410     LLVM_DEBUG({
5411       dbgs() << "LV(REG): VF = " << VFs[Idx] << '\n';
5412       dbgs() << "LV(REG): Found max usage: " << MaxUsages[Idx].size()
5413              << " item\n";
5414       for (const auto &pair : MaxUsages[Idx]) {
5415         dbgs() << "LV(REG): RegisterClass: "
5416                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5417                << " registers\n";
5418       }
5419       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
5420              << " item\n";
5421       for (const auto &pair : Invariant) {
5422         dbgs() << "LV(REG): RegisterClass: "
5423                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5424                << " registers\n";
5425       }
5426     });
5427 
5428     RU.LoopInvariantRegs = Invariant;
5429     RU.MaxLocalUsers = MaxUsages[Idx];
5430     RUs[Idx] = RU;
5431   }
5432 
5433   return RUs;
5434 }
5435 
5436 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
5437                                                            ElementCount VF) {
5438   // TODO: Cost model for emulated masked load/store is completely
5439   // broken. This hack guides the cost model to use an artificially
5440   // high enough value to practically disable vectorization with such
5441   // operations, except where the previously deployed legality hack allowed
5442   // using very low cost values. This is to avoid regressions coming simply
5443   // from moving "masked load/store" check from legality to cost model.
5444   // Masked Load/Gather emulation was previously never allowed.
5445   // Only a limited amount of Masked Store/Scatter emulation was allowed.
5446   assert((isPredicatedInst(I)) &&
5447          "Expecting a scalar emulated instruction");
5448   return isa<LoadInst>(I) ||
5449          (isa<StoreInst>(I) &&
5450           NumPredStores > NumberOfStoresToPredicate);
5451 }
5452 
5453 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
5454   // If we aren't vectorizing the loop, or if we've already collected the
5455   // instructions to scalarize, there's nothing to do. Collection may already
5456   // have occurred if we have a user-selected VF and are now computing the
5457   // expected cost for interleaving.
5458   if (VF.isScalar() || VF.isZero() || InstsToScalarize.contains(VF))
5459     return;
5460 
5461   // Initialize a mapping for VF in InstsToScalarize. If we find that it's
5462   // not profitable to scalarize any instructions, the presence of VF in the
5463   // map will indicate that we've analyzed it already.
5464   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5465 
5466   PredicatedBBsAfterVectorization[VF].clear();
5467 
5468   // Find all the instructions that are scalar with predication in the loop and
5469   // determine if it would be better to not if-convert the blocks they are in.
5470   // If so, we also record the instructions to scalarize.
5471   for (BasicBlock *BB : TheLoop->blocks()) {
5472     if (!blockNeedsPredicationForAnyReason(BB))
5473       continue;
5474     for (Instruction &I : *BB)
5475       if (isScalarWithPredication(&I, VF)) {
5476         ScalarCostsTy ScalarCosts;
5477         // Do not apply discount logic for:
5478         // 1. Scalars after vectorization, as there will only be a single copy
5479         // of the instruction.
5480         // 2. Scalable VF, as that would lead to invalid scalarization costs.
5481         // 3. Emulated masked memrefs, if a hacked cost is needed.
5482         if (!isScalarAfterVectorization(&I, VF) && !VF.isScalable() &&
5483             !useEmulatedMaskMemRefHack(&I, VF) &&
5484             computePredInstDiscount(&I, ScalarCosts, VF) >= 0) {
5485           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5486           // Check if we decided to scalarize a call. If so, update the widening
5487           // decision of the call to CM_Scalarize with the computed scalar cost.
5488           for (const auto &[I, _] : ScalarCosts) {
5489             auto *CI = dyn_cast<CallInst>(I);
5490             if (!CI || !CallWideningDecisions.contains({CI, VF}))
5491               continue;
5492             CallWideningDecisions[{CI, VF}].Kind = CM_Scalarize;
5493             CallWideningDecisions[{CI, VF}].Cost = ScalarCosts[CI];
5494           }
5495         }
5496         // Remember that BB will remain after vectorization.
5497         PredicatedBBsAfterVectorization[VF].insert(BB);
5498         for (auto *Pred : predecessors(BB)) {
5499           if (Pred->getSingleSuccessor() == BB)
5500             PredicatedBBsAfterVectorization[VF].insert(Pred);
5501         }
5502       }
5503   }
5504 }
5505 
5506 InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
5507     Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
5508   assert(!isUniformAfterVectorization(PredInst, VF) &&
5509          "Instruction marked uniform-after-vectorization will be predicated");
5510 
5511   // Initialize the discount to zero, meaning that the scalar version and the
5512   // vector version cost the same.
5513   InstructionCost Discount = 0;
5514 
5515   // Holds instructions to analyze. The instructions we visit are mapped in
5516   // ScalarCosts. Those instructions are the ones that would be scalarized if
5517   // we find that the scalar version costs less.
5518   SmallVector<Instruction *, 8> Worklist;
5519 
5520   // Returns true if the given instruction can be scalarized.
5521   auto CanBeScalarized = [&](Instruction *I) -> bool {
5522     // We only attempt to scalarize instructions forming a single-use chain
5523     // from the original predicated block that would otherwise be vectorized.
5524     // Although not strictly necessary, we give up on instructions we know will
5525     // already be scalar to avoid traversing chains that are unlikely to be
5526     // beneficial.
5527     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5528         isScalarAfterVectorization(I, VF))
5529       return false;
5530 
5531     // If the instruction is scalar with predication, it will be analyzed
5532     // separately. We ignore it within the context of PredInst.
5533     if (isScalarWithPredication(I, VF))
5534       return false;
5535 
5536     // If any of the instruction's operands are uniform after vectorization,
5537     // the instruction cannot be scalarized. This prevents, for example, a
5538     // masked load from being scalarized.
5539     //
5540     // We assume we will only emit a value for lane zero of an instruction
5541     // marked uniform after vectorization, rather than VF identical values.
5542     // Thus, if we scalarize an instruction that uses a uniform, we would
5543     // create uses of values corresponding to the lanes we aren't emitting code
5544     // for. This behavior can be changed by allowing getScalarValue to clone
5545     // the lane zero values for uniforms rather than asserting.
5546     for (Use &U : I->operands())
5547       if (auto *J = dyn_cast<Instruction>(U.get()))
5548         if (isUniformAfterVectorization(J, VF))
5549           return false;
5550 
5551     // Otherwise, we can scalarize the instruction.
5552     return true;
5553   };
5554 
5555   // Compute the expected cost discount from scalarizing the entire expression
5556   // feeding the predicated instruction. We currently only consider expressions
5557   // that are single-use instruction chains.
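       // For example (assumed costs): if the chain's vector cost is 10 and its
       // scalarized cost, after scaling by the block probability, is 6, the
       // accumulated discount is 4 (>= 0), so the caller treats scalarizing the
       // chain as profitable.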
5558   Worklist.push_back(PredInst);
5559   while (!Worklist.empty()) {
5560     Instruction *I = Worklist.pop_back_val();
5561 
5562     // If we've already analyzed the instruction, there's nothing to do.
5563     if (ScalarCosts.contains(I))
5564       continue;
5565 
5566     // Compute the cost of the vector instruction. Note that this cost already
5567     // includes the scalarization overhead of the predicated instruction.
5568     InstructionCost VectorCost = getInstructionCost(I, VF);
5569 
5570     // Compute the cost of the scalarized instruction. This cost is the cost of
5571     // the instruction as if it wasn't if-converted and instead remained in the
5572     // predicated block. We will scale this cost by block probability after
5573     // computing the scalarization overhead.
5574     InstructionCost ScalarCost =
5575         VF.getFixedValue() * getInstructionCost(I, ElementCount::getFixed(1));
5576 
5577     // Compute the scalarization overhead of needed insertelement instructions
5578     // and phi nodes.
5579     TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
5580     if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
5581       ScalarCost += TTI.getScalarizationOverhead(
5582           cast<VectorType>(toVectorTy(I->getType(), VF)),
5583           APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ true,
5584           /*Extract*/ false, CostKind);
5585       ScalarCost +=
5586           VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind);
5587     }
5588 
5589     // Compute the scalarization overhead of needed extractelement
5590     // instructions. For each of the instruction's operands, if the operand can
5591     // be scalarized, add it to the worklist; otherwise, account for the
5592     // overhead.
5593     for (Use &U : I->operands())
5594       if (auto *J = dyn_cast<Instruction>(U.get())) {
5595         assert(VectorType::isValidElementType(J->getType()) &&
5596                "Instruction has non-scalar type");
5597         if (CanBeScalarized(J))
5598           Worklist.push_back(J);
5599         else if (needsExtract(J, VF)) {
5600           ScalarCost += TTI.getScalarizationOverhead(
5601               cast<VectorType>(toVectorTy(J->getType(), VF)),
5602               APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false,
5603               /*Extract*/ true, CostKind);
5604         }
5605       }
5606 
5607     // Scale the total scalar cost by block probability.
5608     ScalarCost /= getReciprocalPredBlockProb();
5609 
5610     // Compute the discount. A non-negative discount means the vector version
5611     // of the instruction costs more, and scalarizing would be beneficial.
5612     Discount += VectorCost - ScalarCost;
5613     ScalarCosts[I] = ScalarCost;
5614   }
5615 
5616   return Discount;
5617 }
5618 
5619 InstructionCost LoopVectorizationCostModel::expectedCost(ElementCount VF) {
5620   InstructionCost Cost;
5621 
5622   // If the vector loop gets executed exactly once with the given VF, ignore the
5623   // costs of comparison and induction instructions, as they'll get simplified
5624   // away.
5625   SmallPtrSet<Instruction *, 2> ValuesToIgnoreForVF;
5626   auto TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5627   if (VF.isFixed() && TC == VF.getFixedValue() && !foldTailByMasking())
5628     addFullyUnrolledInstructionsToIgnore(TheLoop, Legal->getInductionVars(),
5629                                          ValuesToIgnoreForVF);
5630 
5631   // For each block.
5632   for (BasicBlock *BB : TheLoop->blocks()) {
5633     InstructionCost BlockCost;
5634 
5635     // For each instruction in the old loop.
5636     for (Instruction &I : BB->instructionsWithoutDebug()) {
5637       // Skip ignored values.
5638       if (ValuesToIgnore.count(&I) || ValuesToIgnoreForVF.count(&I) ||
5639           (VF.isVector() && VecValuesToIgnore.count(&I)))
5640         continue;
5641 
5642       InstructionCost C = getInstructionCost(&I, VF);
5643 
5644       // Check if we should override the cost.
5645       if (C.isValid() && ForceTargetInstructionCost.getNumOccurrences() > 0)
5646         C = InstructionCost(ForceTargetInstructionCost);
5647 
5648       BlockCost += C;
5649       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF "
5650                         << VF << " For instruction: " << I << '\n');
5651     }
5652 
5653     // If we are vectorizing a predicated block, it will have been
5654     // if-converted. This means that the block's instructions (aside from
5655     // stores and instructions that may divide by zero) will now be
5656     // unconditionally executed. For the scalar case, we may not always execute
5657     // the predicated block, if it is an if-else block. Thus, scale the block's
5658     // cost by the probability of executing it. blockNeedsPredication from
5659     // Legal is used so as to not include all blocks in tail folded loops.
5660     if (VF.isScalar() && Legal->blockNeedsPredication(BB))
5661       BlockCost /= getReciprocalPredBlockProb();
5662 
5663     Cost += BlockCost;
5664   }
5665 
5666   return Cost;
5667 }
5668 
5669 /// Gets Address Access SCEV after verifying that the access pattern
5670 /// is loop invariant except for the induction variable dependence.
5671 ///
5672 /// This SCEV can be sent to the Target in order to estimate the address
5673 /// calculation cost.
5674 static const SCEV *getAddressAccessSCEV(
5675               Value *Ptr,
5676               LoopVectorizationLegality *Legal,
5677               PredicatedScalarEvolution &PSE,
5678               const Loop *TheLoop) {
5679 
5680   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
5681   if (!Gep)
5682     return nullptr;
5683 
5684   // We are looking for a gep with all loop invariant indices except for one
5685   // which should be an induction variable.
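       // For instance, a GEP such as 'getelementptr %A, %invariant, %iv' qualifies,
       // whereas a GEP with an index that is neither loop-invariant nor an
       // induction variable does not, and nullptr is returned below.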
5686   auto *SE = PSE.getSE();
5687   unsigned NumOperands = Gep->getNumOperands();
5688   for (unsigned Idx = 1; Idx < NumOperands; ++Idx) {
5689     Value *Opd = Gep->getOperand(Idx);
5690     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
5691         !Legal->isInductionVariable(Opd))
5692       return nullptr;
5693   }
5694 
5695   // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
5696   return PSE.getSCEV(Ptr);
5697 }
5698 
5699 InstructionCost
5700 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
5701                                                         ElementCount VF) {
5702   assert(VF.isVector() &&
5703          "Scalarization cost of instruction implies vectorization.");
5704   if (VF.isScalable())
5705     return InstructionCost::getInvalid();
5706 
5707   Type *ValTy = getLoadStoreType(I);
5708   auto *SE = PSE.getSE();
5709 
5710   unsigned AS = getLoadStoreAddressSpace(I);
5711   Value *Ptr = getLoadStorePointerOperand(I);
5712   Type *PtrTy = toVectorTy(Ptr->getType(), VF);
5713   // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
5714   //       that it is being called from this specific place.
5715 
5716   // Figure out whether the access is strided and get the stride value
5717   // if it's known at compile time.
5718   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
5719 
5720   // Get the cost of the scalar memory instruction and address computation.
5721   InstructionCost Cost =
5722       VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
5723 
5724   // Don't pass *I here, since it is scalar but will actually be part of a
5725   // vectorized loop where the user of it is a vectorized instruction.
5726   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
5727   const Align Alignment = getLoadStoreAlignment(I);
5728   Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(),
5729                                                       ValTy->getScalarType(),
5730                                                       Alignment, AS, CostKind);
5731 
5732   // Get the overhead of the extractelement and insertelement instructions
5733   // we might create due to scalarization.
5734   Cost += getScalarizationOverhead(I, VF, CostKind);
5735 
5736   // If we have a predicated load/store, it will need extra i1 extracts and
5737   // conditional branches, but may not be executed for each vector lane. Scale
5738   // the cost by the probability of executing the predicated block.
5739   if (isPredicatedInst(I)) {
5740     Cost /= getReciprocalPredBlockProb();
5741 
5742     // Add the cost of an i1 extract and a branch
5743     auto *VecI1Ty =
5744         VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
5745     Cost += TTI.getScalarizationOverhead(
5746         VecI1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
5747         /*Insert=*/false, /*Extract=*/true, CostKind);
5748     Cost += TTI.getCFInstrCost(Instruction::Br, CostKind);
5749 
5750     if (useEmulatedMaskMemRefHack(I, VF))
5751       // Artificially setting to a high enough value to practically disable
5752       // vectorization with such operations.
5753       Cost = 3000000;
5754   }
5755 
5756   return Cost;
5757 }
5758 
5759 InstructionCost
5760 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
5761                                                     ElementCount VF) {
5762   Type *ValTy = getLoadStoreType(I);
5763   auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5764   Value *Ptr = getLoadStorePointerOperand(I);
5765   unsigned AS = getLoadStoreAddressSpace(I);
5766   int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
5767   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
5768 
5769   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5770          "Stride should be 1 or -1 for consecutive memory access");
5771   const Align Alignment = getLoadStoreAlignment(I);
5772   InstructionCost Cost = 0;
5773   if (Legal->isMaskRequired(I)) {
5774     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
5775                                       CostKind);
5776   } else {
5777     TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
5778     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
5779                                 CostKind, OpInfo, I);
5780   }
5781 
5782   bool Reverse = ConsecutiveStride < 0;
5783   if (Reverse)
5784     Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, {},
5785                                CostKind, 0);
5786   return Cost;
5787 }
5788 
5789 InstructionCost
5790 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
5791                                                 ElementCount VF) {
5792   assert(Legal->isUniformMemOp(*I, VF));
5793 
5794   Type *ValTy = getLoadStoreType(I);
5795   auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5796   const Align Alignment = getLoadStoreAlignment(I);
5797   unsigned AS = getLoadStoreAddressSpace(I);
5798   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
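       // A uniform load is costed as a single scalar load plus a broadcast of the
       // loaded value to all vector lanes; a uniform store stores either the
       // loop-invariant value directly or the value extracted from the last lane.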
5799   if (isa<LoadInst>(I)) {
5800     return TTI.getAddressComputationCost(ValTy) +
5801            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
5802                                CostKind) +
5803            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
5804   }
5805   StoreInst *SI = cast<StoreInst>(I);
5806 
5807   bool IsLoopInvariantStoreValue = Legal->isInvariant(SI->getValueOperand());
5808   return TTI.getAddressComputationCost(ValTy) +
5809          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
5810                              CostKind) +
5811          (IsLoopInvariantStoreValue
5812               ? 0
5813               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
5814                                        CostKind, VF.getKnownMinValue() - 1));
5815 }
5816 
5817 InstructionCost
5818 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
5819                                                  ElementCount VF) {
5820   Type *ValTy = getLoadStoreType(I);
5821   auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5822   const Align Alignment = getLoadStoreAlignment(I);
5823   const Value *Ptr = getLoadStorePointerOperand(I);
5824 
5825   return TTI.getAddressComputationCost(VectorTy) +
5826          TTI.getGatherScatterOpCost(
5827              I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
5828              TargetTransformInfo::TCK_RecipThroughput, I);
5829 }
5830 
5831 InstructionCost
5832 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
5833                                                    ElementCount VF) {
5834   const auto *Group = getInterleavedAccessGroup(I);
5835   assert(Group && "Fail to get an interleaved access group.");
5836 
5837   Instruction *InsertPos = Group->getInsertPos();
5838   Type *ValTy = getLoadStoreType(InsertPos);
5839   auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5840   unsigned AS = getLoadStoreAddressSpace(InsertPos);
5841   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
5842 
5843   unsigned InterleaveFactor = Group->getFactor();
5844   auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
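       // E.g. a factor-2 group of i32 accesses at VF 4 is costed below as one wide
       // <8 x i32> access; the TTI hook is expected to account for the shuffles
       // needed to split or combine the group members.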
5845 
5846   // Holds the indices of existing members in the interleaved group.
5847   SmallVector<unsigned, 4> Indices;
5848   for (unsigned IF = 0; IF < InterleaveFactor; IF++)
5849     if (Group->getMember(IF))
5850       Indices.push_back(IF);
5851 
5852   // Calculate the cost of the whole interleaved group.
5853   bool UseMaskForGaps =
5854       (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
5855       (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
5856   InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
5857       InsertPos->getOpcode(), WideVecTy, Group->getFactor(), Indices,
5858       Group->getAlign(), AS, CostKind, Legal->isMaskRequired(I),
5859       UseMaskForGaps);
5860 
5861   if (Group->isReverse()) {
5862     // TODO: Add support for reversed masked interleaved access.
5863     assert(!Legal->isMaskRequired(I) &&
5864            "Reverse masked interleaved access not supported.");
5865     Cost += Group->getNumMembers() *
5866             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, {},
5867                                CostKind, 0);
5868   }
5869   return Cost;
5870 }
5871 
5872 std::optional<InstructionCost>
5873 LoopVectorizationCostModel::getReductionPatternCost(
5874     Instruction *I, ElementCount VF, Type *Ty,
5875     TTI::TargetCostKind CostKind) const {
5876   using namespace llvm::PatternMatch;
5877   // Early exit if there are no in-loop reductions.
5878   if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Ty))
5879     return std::nullopt;
5880   auto *VectorTy = cast<VectorType>(Ty);
5881 
5882   // We are looking for one of the following patterns and the minimal acceptable cost:
5883   //  reduce(mul(ext(A), ext(B))) or
5884   //  reduce(mul(A, B)) or
5885   //  reduce(ext(A)) or
5886   //  reduce(A).
5887   // The basic idea is that we walk down the tree to do that, finding the root
5888   // reduction instruction in InLoopReductionImmediateChains. From there we find
5889   // the pattern of mul/ext and test the cost of the entire pattern vs the cost
5890   // of the components. If the reduction cost is lower, then we return it for the
5891   // reduction instruction and 0 for the other instructions in the pattern. If
5892   // it is not, we return std::nullopt, indicating that the original cost method
5893   // should be used.
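       // For example, when called for the sext in reduce.add(mul(sext(A), sext(B))),
       // the walk below moves RetI from the sext to the mul and then to the add; if
       // the combined reduction cost wins, the sext and mul report a cost of 0 and
       // the add carries the combined cost.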
5894   Instruction *RetI = I;
5895   if (match(RetI, m_ZExtOrSExt(m_Value()))) {
5896     if (!RetI->hasOneUser())
5897       return std::nullopt;
5898     RetI = RetI->user_back();
5899   }
5900 
5901   if (match(RetI, m_OneUse(m_Mul(m_Value(), m_Value()))) &&
5902       RetI->user_back()->getOpcode() == Instruction::Add) {
5903     RetI = RetI->user_back();
5904   }
5905 
5906   // Test if the found instruction is a reduction. If it is not, return
5907   // std::nullopt so that the caller falls back to the original cost modelling.
5908   if (!InLoopReductionImmediateChains.count(RetI))
5909     return std::nullopt;
5910 
5911   // Find the reduction this chain is a part of and calculate the basic cost of
5912   // the reduction on its own.
5913   Instruction *LastChain = InLoopReductionImmediateChains.at(RetI);
5914   Instruction *ReductionPhi = LastChain;
5915   while (!isa<PHINode>(ReductionPhi))
5916     ReductionPhi = InLoopReductionImmediateChains.at(ReductionPhi);
5917 
5918   const RecurrenceDescriptor &RdxDesc =
5919       Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second;
5920 
5921   InstructionCost BaseCost;
5922   RecurKind RK = RdxDesc.getRecurrenceKind();
5923   if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) {
5924     Intrinsic::ID MinMaxID = getMinMaxReductionIntrinsicOp(RK);
5925     BaseCost = TTI.getMinMaxReductionCost(MinMaxID, VectorTy,
5926                                           RdxDesc.getFastMathFlags(), CostKind);
5927   } else {
5928     BaseCost = TTI.getArithmeticReductionCost(
5929         RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
5930   }
5931 
5932   // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
5933   // normal fmul instruction to the cost of the fadd reduction.
5934   if (RK == RecurKind::FMulAdd)
5935     BaseCost +=
5936         TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);
5937 
5938   // If we're using ordered reductions then we can just return the base cost
5939   // here, since getArithmeticReductionCost calculates the full ordered
5940   // reduction cost when FP reassociation is not allowed.
5941   if (useOrderedReductions(RdxDesc))
5942     return BaseCost;
5943 
5944   // Get the operand that was not the reduction chain and match it to one of the
5945   // patterns, returning the better cost if it is found.
5946   Instruction *RedOp = RetI->getOperand(1) == LastChain
5947                            ? dyn_cast<Instruction>(RetI->getOperand(0))
5948                            : dyn_cast<Instruction>(RetI->getOperand(1));
5949 
5950   VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
5951 
5952   Instruction *Op0, *Op1;
5953   if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
5954       match(RedOp,
5955             m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
5956       match(Op0, m_ZExtOrSExt(m_Value())) &&
5957       Op0->getOpcode() == Op1->getOpcode() &&
5958       Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
5959       !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) &&
5960       (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
5961 
5962     // Matched reduce.add(ext(mul(ext(A), ext(B))))
5963     // Note that the extend opcodes need to all match, or if A==B they will have
5964     // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
5965     // which is equally fine.
5966     bool IsUnsigned = isa<ZExtInst>(Op0);
5967     auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
5968     auto *MulType = VectorType::get(Op0->getType(), VectorTy);
5969 
5970     InstructionCost ExtCost =
5971         TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
5972                              TTI::CastContextHint::None, CostKind, Op0);
5973     InstructionCost MulCost =
5974         TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
5975     InstructionCost Ext2Cost =
5976         TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType,
5977                              TTI::CastContextHint::None, CostKind, RedOp);
5978 
5979     InstructionCost RedCost = TTI.getMulAccReductionCost(
5980         IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
5981 
5982     if (RedCost.isValid() &&
5983         RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
5984       return I == RetI ? RedCost : 0;
5985   } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
5986              !TheLoop->isLoopInvariant(RedOp)) {
5987     // Matched reduce(ext(A))
5988     bool IsUnsigned = isa<ZExtInst>(RedOp);
5989     auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
5990     InstructionCost RedCost = TTI.getExtendedReductionCost(
5991         RdxDesc.getOpcode(), IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
5992         RdxDesc.getFastMathFlags(), CostKind);
5993 
5994     InstructionCost ExtCost =
5995         TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
5996                              TTI::CastContextHint::None, CostKind, RedOp);
5997     if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
5998       return I == RetI ? RedCost : 0;
5999   } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
6000              match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
6001     if (match(Op0, m_ZExtOrSExt(m_Value())) &&
6002         Op0->getOpcode() == Op1->getOpcode() &&
6003         !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) {
6004       bool IsUnsigned = isa<ZExtInst>(Op0);
6005       Type *Op0Ty = Op0->getOperand(0)->getType();
6006       Type *Op1Ty = Op1->getOperand(0)->getType();
6007       Type *LargestOpTy =
6008           Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
6009                                                                     : Op0Ty;
6010       auto *ExtType = VectorType::get(LargestOpTy, VectorTy);
6011 
6012       // Matched reduce.add(mul(ext(A), ext(B))), where the two exts may have
6013       // different source sizes. We take the largest type as the ext to reduce
6014       // and add the remaining cost as, for example, reduce(mul(ext(ext(A)), ext(B))).
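           // For example (illustrative types only):
           //   %a.ext = sext <8 x i8>  %a to <8 x i32>
           //   %b.ext = sext <8 x i16> %b to <8 x i32>
           //   %mul   = mul <8 x i32> %a.ext, %b.ext
           //   ... reduce.add(%mul) ...
           // Here i16 is the largest source type, so ExtType is <8 x i16> and the
           // i8 operand is charged an extra extend from i8 to i16.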
6015       InstructionCost ExtCost0 = TTI.getCastInstrCost(
6016           Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy),
6017           TTI::CastContextHint::None, CostKind, Op0);
6018       InstructionCost ExtCost1 = TTI.getCastInstrCost(
6019           Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy),
6020           TTI::CastContextHint::None, CostKind, Op1);
6021       InstructionCost MulCost =
6022           TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6023 
6024       InstructionCost RedCost = TTI.getMulAccReductionCost(
6025           IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
6026       InstructionCost ExtraExtCost = 0;
6027       if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
6028         Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
6029         ExtraExtCost = TTI.getCastInstrCost(
6030             ExtraExtOp->getOpcode(), ExtType,
6031             VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy),
6032             TTI::CastContextHint::None, CostKind, ExtraExtOp);
6033       }
6034 
6035       if (RedCost.isValid() &&
6036           (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
6037         return I == RetI ? RedCost : 0;
6038     } else if (!match(I, m_ZExtOrSExt(m_Value()))) {
6039       // Matched reduce.add(mul())
6040       InstructionCost MulCost =
6041           TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6042 
6043       InstructionCost RedCost = TTI.getMulAccReductionCost(
6044           true, RdxDesc.getRecurrenceType(), VectorTy, CostKind);
6045 
6046       if (RedCost.isValid() && RedCost < MulCost + BaseCost)
6047         return I == RetI ? RedCost : 0;
6048     }
6049   }
6050 
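       // No cheaper reduction pattern matched: charge the base reduction cost to
       // the final instruction of the chain and let any other chain instructions
       // be costed by the generic path.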
6051   return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt;
6052 }
6053 
6054 InstructionCost
6055 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
6056                                                      ElementCount VF) {
6057   // Calculate scalar cost only. Vectorization cost should be ready at this
6058   // moment.
6059   if (VF.isScalar()) {
6060     Type *ValTy = getLoadStoreType(I);
6061     const Align Alignment = getLoadStoreAlignment(I);
6062     unsigned AS = getLoadStoreAddressSpace(I);
6063 
6064     TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
6065     return TTI.getAddressComputationCost(ValTy) +
6066            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
6067                                TTI::TCK_RecipThroughput, OpInfo, I);
6068   }
6069   return getWideningCost(I, VF);
6070 }
6071 
6072 InstructionCost LoopVectorizationCostModel::getScalarizationOverhead(
6073     Instruction *I, ElementCount VF, TTI::TargetCostKind CostKind) const {
6074 
6075   // There is no mechanism yet to create a scalable scalarization loop,
6076   // so this is currently Invalid.
6077   if (VF.isScalable())
6078     return InstructionCost::getInvalid();
6079 
6080   if (VF.isScalar())
6081     return 0;
6082 
6083   InstructionCost Cost = 0;
6084   Type *RetTy = toVectorTy(I->getType(), VF);
6085   if (!RetTy->isVoidTy() &&
6086       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
6087     Cost += TTI.getScalarizationOverhead(
6088         cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()),
6089         /*Insert*/ true,
6090         /*Extract*/ false, CostKind);
6091 
6092   // Some targets keep addresses scalar.
6093   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
6094     return Cost;
6095 
6096   // Some targets support efficient element stores.
6097   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
6098     return Cost;
6099 
6100   // Collect operands to consider.
6101   CallInst *CI = dyn_cast<CallInst>(I);
6102   Instruction::op_range Ops = CI ? CI->args() : I->operands();
6103 
6104   // Skip operands that do not require extraction/scalarization and do not incur
6105   // any overhead.
6106   SmallVector<Type *> Tys;
6107   for (auto *V : filterExtractingOperands(Ops, VF))
6108     Tys.push_back(maybeVectorizeType(V->getType(), VF));
6109   return Cost + TTI.getOperandsScalarizationOverhead(
6110                     filterExtractingOperands(Ops, VF), Tys, CostKind);
6111 }
6112 
6113 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
6114   if (VF.isScalar())
6115     return;
6116   NumPredStores = 0;
6117   for (BasicBlock *BB : TheLoop->blocks()) {
6118     // For each instruction in the old loop.
6119     for (Instruction &I : *BB) {
6120       Value *Ptr = getLoadStorePointerOperand(&I);
6121       if (!Ptr)
6122         continue;
6123 
6124       // TODO: We should generate better code and update the cost model for
6125       // predicated uniform stores. Today they are treated as any other
6126       // predicated store (see added test cases in
6127       // invariant-store-vectorization.ll).
6128       if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF))
6129         NumPredStores++;
6130 
6131       if (Legal->isUniformMemOp(I, VF)) {
6132         auto IsLegalToScalarize = [&]() {
6133           if (!VF.isScalable())
6134             // Scalarization of fixed length vectors "just works".
6135             return true;
6136 
6137           // We have dedicated lowering for unpredicated uniform loads and
6138           // stores.  Note that even with tail folding we know that at least
6139           // one lane is active (i.e. generalized predication is not possible
6140           // here), and the logic below depends on this fact.
6141           if (!foldTailByMasking())
6142             return true;
6143 
6144           // For scalable vectors, a uniform memop load is always
6145           // uniform-by-parts and we know how to scalarize that.
6146           if (isa<LoadInst>(I))
6147             return true;
6148 
6149           // A uniform store isn't necessarily uniform-by-parts
6150           // and we can't assume scalarization.
6151           auto &SI = cast<StoreInst>(I);
6152           return TheLoop->isLoopInvariant(SI.getValueOperand());
6153         };
6154 
6155         const InstructionCost GatherScatterCost =
6156           isLegalGatherOrScatter(&I, VF) ?
6157           getGatherScatterCost(&I, VF) : InstructionCost::getInvalid();
6158 
6159         // Load: Scalar load + broadcast
6160         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6161         // FIXME: This cost is a significant under-estimate for tail folded
6162         // memory ops.
6163         const InstructionCost ScalarizationCost =
6164             IsLegalToScalarize() ? getUniformMemOpCost(&I, VF)
6165                                  : InstructionCost::getInvalid();
6166 
6167         // Choose the better solution for the current VF. Note that Invalid
6168         // costs compare as maximally large. If both are invalid, we get an
6169         // invalid cost, which signals a failure and a vectorization abort.
6170         if (GatherScatterCost < ScalarizationCost)
6171           setWideningDecision(&I, VF, CM_GatherScatter, GatherScatterCost);
6172         else
6173           setWideningDecision(&I, VF, CM_Scalarize, ScalarizationCost);
6174         continue;
6175       }
6176 
6177       // We assume that widening is the best solution when possible.
6178       if (memoryInstructionCanBeWidened(&I, VF)) {
6179         InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
6180         int ConsecutiveStride = Legal->isConsecutivePtr(
6181             getLoadStoreType(&I), getLoadStorePointerOperand(&I));
6182         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6183                "Expected consecutive stride.");
6184         InstWidening Decision =
6185             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6186         setWideningDecision(&I, VF, Decision, Cost);
6187         continue;
6188       }
6189 
6190       // Choose between Interleaving, Gather/Scatter or Scalarization.
6191       InstructionCost InterleaveCost = InstructionCost::getInvalid();
6192       unsigned NumAccesses = 1;
6193       if (isAccessInterleaved(&I)) {
6194         const auto *Group = getInterleavedAccessGroup(&I);
6195         assert(Group && "Fail to get an interleaved access group.");
6196 
6197         // Make one decision for the whole group.
6198         if (getWideningDecision(&I, VF) != CM_Unknown)
6199           continue;
6200 
6201         NumAccesses = Group->getNumMembers();
6202         if (interleavedAccessCanBeWidened(&I, VF))
6203           InterleaveCost = getInterleaveGroupCost(&I, VF);
6204       }
6205 
6206       InstructionCost GatherScatterCost =
6207           isLegalGatherOrScatter(&I, VF)
6208               ? getGatherScatterCost(&I, VF) * NumAccesses
6209               : InstructionCost::getInvalid();
6210 
6211       InstructionCost ScalarizationCost =
6212           getMemInstScalarizationCost(&I, VF) * NumAccesses;
6213 
6214       // Choose the better solution for the current VF, record the decision, and
6215       // use it during vectorization.
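           // Ties between interleaving and gather/scatter are resolved in favor of
           // interleaving; ties between gather/scatter and scalarization in favor
           // of scalarization.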
6216       InstructionCost Cost;
6217       InstWidening Decision;
6218       if (InterleaveCost <= GatherScatterCost &&
6219           InterleaveCost < ScalarizationCost) {
6220         Decision = CM_Interleave;
6221         Cost = InterleaveCost;
6222       } else if (GatherScatterCost < ScalarizationCost) {
6223         Decision = CM_GatherScatter;
6224         Cost = GatherScatterCost;
6225       } else {
6226         Decision = CM_Scalarize;
6227         Cost = ScalarizationCost;
6228       }
6229       // If the instruction belongs to an interleave group, the whole group
6230       // receives the same decision. The cost is recorded for the group, but it
6231       // will actually be assigned to a single member instruction.
6232       if (const auto *Group = getInterleavedAccessGroup(&I))
6233         setWideningDecision(Group, VF, Decision, Cost);
6234       else
6235         setWideningDecision(&I, VF, Decision, Cost);
6236     }
6237   }
6238 
6239   // Make sure that any load of an address and any other address computation
6240   // remains scalar unless there is gather/scatter support. This avoids
6241   // inevitable extracts into address registers, and also has the benefit of
6242   // activating LSR more, since that pass can't optimize vectorized
6243   // addresses.
6244   if (TTI.prefersVectorizedAddressing())
6245     return;
6246 
6247   // Start with all scalar pointer uses.
6248   SmallPtrSet<Instruction *, 8> AddrDefs;
6249   for (BasicBlock *BB : TheLoop->blocks())
6250     for (Instruction &I : *BB) {
6251       Instruction *PtrDef =
6252         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6253       if (PtrDef && TheLoop->contains(PtrDef) &&
6254           getWideningDecision(&I, VF) != CM_GatherScatter)
6255         AddrDefs.insert(PtrDef);
6256     }
6257 
6258   // Add all instructions used to generate the addresses.
6259   SmallVector<Instruction *, 4> Worklist;
6260   append_range(Worklist, AddrDefs);
6261   while (!Worklist.empty()) {
6262     Instruction *I = Worklist.pop_back_val();
6263     for (auto &Op : I->operands())
6264       if (auto *InstOp = dyn_cast<Instruction>(Op))
6265         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6266             AddrDefs.insert(InstOp).second)
6267           Worklist.push_back(InstOp);
6268   }
6269 
6270   for (auto *I : AddrDefs) {
6271     if (isa<LoadInst>(I)) {
6272       // Setting the desired widening decision should ideally be handled by the
6273       // cost functions, but since that involves determining whether the loaded
6274       // value is involved in an address computation, it is instead changed here
6275       // once we know this is the case.
6276       InstWidening Decision = getWideningDecision(I, VF);
6277       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6278         // Scalarize a widened load of address.
6279         setWideningDecision(
6280             I, VF, CM_Scalarize,
6281             (VF.getKnownMinValue() *
6282              getMemoryInstructionCost(I, ElementCount::getFixed(1))));
6283       else if (const auto *Group = getInterleavedAccessGroup(I)) {
6284         // Scalarize an interleave group of address loads.
6285         for (unsigned I = 0; I < Group->getFactor(); ++I) {
6286           if (Instruction *Member = Group->getMember(I))
6287             setWideningDecision(
6288                 Member, VF, CM_Scalarize,
6289                 (VF.getKnownMinValue() *
6290                  getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
6291         }
6292       }
6293     } else
6294       // Make sure I gets scalarized and a cost estimate without
6295       // scalarization overhead.
6296       ForcedScalars[VF].insert(I);
6297   }
6298 }
6299 
6300 void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
6301   assert(!VF.isScalar() &&
6302          "Trying to set a vectorization decision for a scalar VF");
6303 
6304   auto ForcedScalar = ForcedScalars.find(VF);
6305   for (BasicBlock *BB : TheLoop->blocks()) {
6306     // For each instruction in the old loop.
6307     for (Instruction &I : *BB) {
6308       CallInst *CI = dyn_cast<CallInst>(&I);
6309 
6310       if (!CI)
6311         continue;
6312 
6313       InstructionCost ScalarCost = InstructionCost::getInvalid();
6314       InstructionCost VectorCost = InstructionCost::getInvalid();
6315       InstructionCost IntrinsicCost = InstructionCost::getInvalid();
6316       TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6317       Function *ScalarFunc = CI->getCalledFunction();
6318       Type *ScalarRetTy = CI->getType();
6319       SmallVector<Type *, 4> Tys, ScalarTys;
6320       for (auto &ArgOp : CI->args())
6321         ScalarTys.push_back(ArgOp->getType());
6322 
6323       // Estimate cost of scalarized vector call. The source operands are
6324       // assumed to be vectors, so we need to extract individual elements from
6325       // there, execute VF scalar calls, and then gather the result into the
6326       // vector return value.
6327       InstructionCost ScalarCallCost =
6328           TTI.getCallInstrCost(ScalarFunc, ScalarRetTy, ScalarTys, CostKind);
6329 
6330       // Compute costs of unpacking argument values for the scalar calls and
6331       // packing the return values to a vector.
6332       InstructionCost ScalarizationCost =
6333           getScalarizationOverhead(CI, VF, CostKind);
6334 
6335       ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
6336       // Honor ForcedScalars and UniformAfterVectorization decisions.
6337       // TODO: For calls, it might still be more profitable to widen. Use
6338       // VPlan-based cost model to compare different options.
6339       if (VF.isVector() && ((ForcedScalar != ForcedScalars.end() &&
6340                              ForcedScalar->second.contains(CI)) ||
6341                             isUniformAfterVectorization(CI, VF))) {
6342         setCallWideningDecision(CI, VF, CM_Scalarize, nullptr,
6343                                 Intrinsic::not_intrinsic, std::nullopt,
6344                                 ScalarCost);
6345         continue;
6346       }
6347 
6348       bool MaskRequired = Legal->isMaskRequired(CI);
6349       // Compute corresponding vector type for return value and arguments.
6350       Type *RetTy = toVectorTy(ScalarRetTy, VF);
6351       for (Type *ScalarTy : ScalarTys)
6352         Tys.push_back(toVectorTy(ScalarTy, VF));
6353 
6354       // An in-loop reduction using an fmuladd intrinsic is a special case;
6355       // we don't want the normal cost for that intrinsic.
6356       if (RecurrenceDescriptor::isFMulAddIntrinsic(CI))
6357         if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind)) {
6358           setCallWideningDecision(CI, VF, CM_IntrinsicCall, nullptr,
6359                                   getVectorIntrinsicIDForCall(CI, TLI),
6360                                   std::nullopt, *RedCost);
6361           continue;
6362         }
6363 
6364       // Find the cost of vectorizing the call, if we can find a suitable
6365       // vector variant of the function.
6366       bool UsesMask = false;
6367       VFInfo FuncInfo;
6368       Function *VecFunc = nullptr;
6369       // Search through any available variants for one we can use at this VF.
6370       for (VFInfo &Info : VFDatabase::getMappings(*CI)) {
6371         // Must match requested VF.
6372         if (Info.Shape.VF != VF)
6373           continue;
6374 
6375         // Must take a mask argument if one is required
6376         if (MaskRequired && !Info.isMasked())
6377           continue;
6378 
6379         // Check that all parameter kinds are supported
6380         bool ParamsOk = true;
6381         for (VFParameter Param : Info.Shape.Parameters) {
6382           switch (Param.ParamKind) {
6383           case VFParamKind::Vector:
6384             break;
6385           case VFParamKind::OMP_Uniform: {
6386             Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
6387             // Make sure the scalar parameter in the loop is invariant.
6388             if (!PSE.getSE()->isLoopInvariant(PSE.getSCEV(ScalarParam),
6389                                               TheLoop))
6390               ParamsOk = false;
6391             break;
6392           }
6393           case VFParamKind::OMP_Linear: {
6394             Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
6395             // Find the stride for the scalar parameter in this loop and see if
6396             // it matches the stride for the variant.
6397             // TODO: do we need to figure out the cost of an extract to get the
6398             // first lane? Or do we hope that it will be folded away?
6399             ScalarEvolution *SE = PSE.getSE();
6400             const auto *SAR =
6401                 dyn_cast<SCEVAddRecExpr>(SE->getSCEV(ScalarParam));
6402 
6403             if (!SAR || SAR->getLoop() != TheLoop) {
6404               ParamsOk = false;
6405               break;
6406             }
6407 
6408             const SCEVConstant *Step =
6409                 dyn_cast<SCEVConstant>(SAR->getStepRecurrence(*SE));
6410 
6411             if (!Step ||
6412                 Step->getAPInt().getSExtValue() != Param.LinearStepOrPos)
6413               ParamsOk = false;
6414 
6415             break;
6416           }
6417           case VFParamKind::GlobalPredicate:
6418             UsesMask = true;
6419             break;
6420           default:
6421             ParamsOk = false;
6422             break;
6423           }
6424         }
6425 
6426         if (!ParamsOk)
6427           continue;
6428 
6429         // Found a suitable candidate, stop here.
6430         VecFunc = CI->getModule()->getFunction(Info.VectorName);
6431         FuncInfo = Info;
6432         break;
6433       }
6434 
6435       // Add in the cost of synthesizing a mask if one wasn't required.
6436       InstructionCost MaskCost = 0;
6437       if (VecFunc && UsesMask && !MaskRequired)
6438         MaskCost = TTI.getShuffleCost(
6439             TargetTransformInfo::SK_Broadcast,
6440             VectorType::get(IntegerType::getInt1Ty(
6441                                 VecFunc->getFunctionType()->getContext()),
6442                             VF));
6443 
6444       if (TLI && VecFunc && !CI->isNoBuiltin())
6445         VectorCost =
6446             TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind) + MaskCost;
6447 
6448       // Find the cost of an intrinsic; some targets may have instructions that
6449       // perform the operation without needing an actual call.
6450       Intrinsic::ID IID = getVectorIntrinsicIDForCall(CI, TLI);
6451       if (IID != Intrinsic::not_intrinsic)
6452         IntrinsicCost = getVectorIntrinsicCost(CI, VF);
6453 
6454       InstructionCost Cost = ScalarCost;
6455       InstWidening Decision = CM_Scalarize;
6456 
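           // Prefer the cheapest option; on a tie a vector call is preferred over
           // scalarization, and an intrinsic call over both.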
6457       if (VectorCost <= Cost) {
6458         Cost = VectorCost;
6459         Decision = CM_VectorCall;
6460       }
6461 
6462       if (IntrinsicCost <= Cost) {
6463         Cost = IntrinsicCost;
6464         Decision = CM_IntrinsicCall;
6465       }
6466 
6467       setCallWideningDecision(CI, VF, Decision, VecFunc, IID,
6468                               FuncInfo.getParamIndexForOptionalMask(), Cost);
6469     }
6470   }
6471 }
6472 
6473 bool LoopVectorizationCostModel::shouldConsiderInvariant(Value *Op) {
6474   if (!Legal->isInvariant(Op))
6475     return false;
6476   // Consider Op invariant only if neither it nor its operands are predicated
6477   // instructions in the loop; a predicated instruction is not trivially hoistable.
6478   auto *OpI = dyn_cast<Instruction>(Op);
6479   return !OpI || !TheLoop->contains(OpI) ||
6480          (!isPredicatedInst(OpI) &&
6481           (!isa<PHINode>(OpI) || OpI->getParent() != TheLoop->getHeader()) &&
6482           all_of(OpI->operands(),
6483                  [this](Value *Op) { return shouldConsiderInvariant(Op); }));
6484 }
6485 
6486 InstructionCost
6487 LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6488                                                ElementCount VF) {
6489   // If we know that this instruction will remain uniform, check the cost of
6490   // the scalar version.
6491   if (isUniformAfterVectorization(I, VF))
6492     VF = ElementCount::getFixed(1);
6493 
6494   if (VF.isVector() && isProfitableToScalarize(I, VF))
6495     return InstsToScalarize[VF][I];
6496 
6497   // Forced scalars do not have any scalarization overhead.
6498   auto ForcedScalar = ForcedScalars.find(VF);
6499   if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
6500     auto InstSet = ForcedScalar->second;
6501     if (InstSet.count(I))
6502       return getInstructionCost(I, ElementCount::getFixed(1)) *
6503              VF.getKnownMinValue();
6504   }
6505 
6506   Type *RetTy = I->getType();
6507   if (canTruncateToMinimalBitwidth(I, VF))
6508     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6509   auto *SE = PSE.getSE();
6510   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6511 
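       // Returns true if neither I nor any of its users have been marked for
       // scalarization at this VF, i.e. only a single copy of I is expected to be
       // generated after vectorization.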
6512   auto HasSingleCopyAfterVectorization = [this](Instruction *I,
6513                                                 ElementCount VF) -> bool {
6514     if (VF.isScalar())
6515       return true;
6516 
6517     auto Scalarized = InstsToScalarize.find(VF);
6518     assert(Scalarized != InstsToScalarize.end() &&
6519            "VF not yet analyzed for scalarization profitability");
6520     return !Scalarized->second.count(I) &&
6521            llvm::all_of(I->users(), [&](User *U) {
6522              auto *UI = cast<Instruction>(U);
6523              return !Scalarized->second.count(UI);
6524            });
6525   };
6526   (void)HasSingleCopyAfterVectorization;
6527 
6528   Type *VectorTy;
6529   if (isScalarAfterVectorization(I, VF)) {
6530     // With the exception of GEPs and PHIs, after scalarization there should
6531     // only be one copy of the instruction generated in the loop. This is
6532     // because the VF is either 1, or any instructions that need scalarizing
6533     // have already been dealt with by the time we get here. As a result, we
6534     // don't have to multiply the instruction cost by VF.
6535     assert(I->getOpcode() == Instruction::GetElementPtr ||
6536            I->getOpcode() == Instruction::PHI ||
6537            (I->getOpcode() == Instruction::BitCast &&
6538             I->getType()->isPointerTy()) ||
6539            HasSingleCopyAfterVectorization(I, VF));
6540     VectorTy = RetTy;
6541   } else
6542     VectorTy = toVectorTy(RetTy, VF);
6543 
6544   if (VF.isVector() && VectorTy->isVectorTy() &&
6545       !TTI.getNumberOfParts(VectorTy))
6546     return InstructionCost::getInvalid();
6547 
6548   // TODO: We need to estimate the cost of intrinsic calls.
6549   switch (I->getOpcode()) {
6550   case Instruction::GetElementPtr:
6551     // We mark this instruction as zero-cost because the cost of GEPs in
6552     // vectorized code depends on whether the corresponding memory instruction
6553     // is scalarized or not. Therefore, we handle GEPs with the memory
6554     // instruction cost.
6555     return 0;
6556   case Instruction::Br: {
6557     // In cases of scalarized and predicated instructions, there will be VF
6558     // predicated blocks in the vectorized loop. Each branch around these
6559     // blocks also requires an extract of its vector compare i1 element.
6560     // Note that the conditional branch from the loop latch will be replaced by
6561     // a single branch controlling the loop, so there is no extra overhead from
6562     // scalarization.
6563     bool ScalarPredicatedBB = false;
6564     BranchInst *BI = cast<BranchInst>(I);
6565     if (VF.isVector() && BI->isConditional() &&
6566         (PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) ||
6567          PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))) &&
6568         BI->getParent() != TheLoop->getLoopLatch())
6569       ScalarPredicatedBB = true;
6570 
6571     if (ScalarPredicatedBB) {
6572       // Not possible to scalarize a scalable vector with predicated instructions.
6573       if (VF.isScalable())
6574         return InstructionCost::getInvalid();
6575       // Return cost for branches around scalarized and predicated blocks.
6576       auto *VecI1Ty =
6577           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6578       return (
6579           TTI.getScalarizationOverhead(
6580               VecI1Ty, APInt::getAllOnes(VF.getFixedValue()),
6581               /*Insert*/ false, /*Extract*/ true, CostKind) +
6582           (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
6583     }
6584 
6585     if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
6586       // The back-edge branch will remain, as will all scalar branches.
6587       return TTI.getCFInstrCost(Instruction::Br, CostKind);
6588 
6589     // This branch will be eliminated by if-conversion.
6590     return 0;
6591     // Note: We currently assume zero cost for an unconditional branch inside
6592     // a predicated block since it will become a fall-through, although we
6593     // may decide in the future to call TTI for all branches.
6594   }
6595   case Instruction::Switch: {
6596     if (VF.isScalar())
6597       return TTI.getCFInstrCost(Instruction::Switch, CostKind);
6598     auto *Switch = cast<SwitchInst>(I);
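         // A vectorized switch is approximated as one vector integer compare per
         // case value.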
6599     return Switch->getNumCases() *
6600            TTI.getCmpSelInstrCost(
6601                Instruction::ICmp,
6602                toVectorTy(Switch->getCondition()->getType(), VF),
6603                toVectorTy(Type::getInt1Ty(I->getContext()), VF),
6604                CmpInst::ICMP_EQ, CostKind);
6605   }
6606   case Instruction::PHI: {
6607     auto *Phi = cast<PHINode>(I);
6608 
6609     // First-order recurrences are replaced by vector shuffles inside the loop.
6610     if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) {
6611       // For <vscale x 1 x i64>, if vscale = 1 we are unable to extract the
6612       // penultimate value of the recurrence.
6613       // TODO: Consider vscale_range info.
6614       if (VF.isScalable() && VF.getKnownMinValue() == 1)
6615         return InstructionCost::getInvalid();
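           // The recurrence is costed as a vector splice of the previous and current
           // vector values; e.g. for VF=4 the shuffle mask is <3, 4, 5, 6>.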
6616       SmallVector<int> Mask(VF.getKnownMinValue());
6617       std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
6618       return TTI.getShuffleCost(TargetTransformInfo::SK_Splice,
6619                                 cast<VectorType>(VectorTy), Mask, CostKind,
6620                                 VF.getKnownMinValue() - 1);
6621     }
6622 
6623     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6624     // converted into select instructions. We require N - 1 selects per phi
6625     // node, where N is the number of incoming values.
6626     if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) {
6627       Type *ResultTy = Phi->getType();
6628 
6629       // All instructions in an Any-of reduction chain are narrowed to bool.
6630       // Check if that is the case for this phi node.
6631       auto *HeaderUser = cast_if_present<PHINode>(
6632           find_singleton<User>(Phi->users(), [this](User *U, bool) -> User * {
6633             auto *Phi = dyn_cast<PHINode>(U);
6634             if (Phi && Phi->getParent() == TheLoop->getHeader())
6635               return Phi;
6636             return nullptr;
6637           }));
6638       if (HeaderUser) {
6639         auto &ReductionVars = Legal->getReductionVars();
6640         auto Iter = ReductionVars.find(HeaderUser);
6641         if (Iter != ReductionVars.end() &&
6642             RecurrenceDescriptor::isAnyOfRecurrenceKind(
6643                 Iter->second.getRecurrenceKind()))
6644           ResultTy = Type::getInt1Ty(Phi->getContext());
6645       }
6646       return (Phi->getNumIncomingValues() - 1) *
6647              TTI.getCmpSelInstrCost(
6648                  Instruction::Select, toVectorTy(ResultTy, VF),
6649                  toVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
6650                  CmpInst::BAD_ICMP_PREDICATE, CostKind);
6651     }
6652 
6653     // When tail folding with EVL, if the phi is part of an out-of-loop
6654     // reduction then it will be transformed into a wide vp_merge.
6655     if (VF.isVector() && foldTailWithEVL() &&
6656         Legal->getReductionVars().contains(Phi) && !isInLoopReduction(Phi)) {
6657       IntrinsicCostAttributes ICA(
6658           Intrinsic::vp_merge, toVectorTy(Phi->getType(), VF),
6659           {toVectorTy(Type::getInt1Ty(Phi->getContext()), VF)});
6660       return TTI.getIntrinsicInstrCost(ICA, CostKind);
6661     }
6662 
6663     return TTI.getCFInstrCost(Instruction::PHI, CostKind);
6664   }
6665   case Instruction::UDiv:
6666   case Instruction::SDiv:
6667   case Instruction::URem:
6668   case Instruction::SRem:
6669     if (VF.isVector() && isPredicatedInst(I)) {
6670       const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
6671       return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ?
6672         ScalarCost : SafeDivisorCost;
6673     }
6674     // We've proven all lanes safe to speculate, fall through.
6675     [[fallthrough]];
6676   case Instruction::Add:
6677   case Instruction::Sub: {
6678     auto Info = Legal->getHistogramInfo(I);
6679     if (Info && VF.isVector()) {
6680       const HistogramInfo *HGram = Info.value();
6681       // Assume that a non-constant update value (or a constant != 1) requires
6682       // a multiply, and add that into the cost.
6683       InstructionCost MulCost = TTI::TCC_Free;
6684       ConstantInt *RHS = dyn_cast<ConstantInt>(I->getOperand(1));
6685       if (!RHS || RHS->getZExtValue() != 1)
6686         MulCost = TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy);
6687 
6688       // Find the cost of the histogram operation itself.
6689       Type *PtrTy = VectorType::get(HGram->Load->getPointerOperandType(), VF);
6690       Type *ScalarTy = I->getType();
6691       Type *MaskTy = VectorType::get(Type::getInt1Ty(I->getContext()), VF);
6692       IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add,
6693                                   Type::getVoidTy(I->getContext()),
6694                                   {PtrTy, ScalarTy, MaskTy});
6695 
6696       // Add the costs together with the add/sub operation.
6697       return TTI.getIntrinsicInstrCost(
6698                  ICA, TargetTransformInfo::TCK_RecipThroughput) +
6699              MulCost + TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy);
6700     }
6701     [[fallthrough]];
6702   }
6703   case Instruction::FAdd:
6704   case Instruction::FSub:
6705   case Instruction::Mul:
6706   case Instruction::FMul:
6707   case Instruction::FDiv:
6708   case Instruction::FRem:
6709   case Instruction::Shl:
6710   case Instruction::LShr:
6711   case Instruction::AShr:
6712   case Instruction::And:
6713   case Instruction::Or:
6714   case Instruction::Xor: {
6715     // If we're speculating on the stride being 1, the multiplication may
6716     // fold away.  We can generalize this for all operations using the notion
6717     // of neutral elements.  (TODO)
6718     if (I->getOpcode() == Instruction::Mul &&
6719         (PSE.getSCEV(I->getOperand(0))->isOne() ||
6720          PSE.getSCEV(I->getOperand(1))->isOne()))
6721       return 0;
6722 
6723     // Detect reduction patterns
6724     if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
6725       return *RedCost;
6726 
6727     // Certain instructions can be cheaper to vectorize if they have a constant
6728     // second vector operand. One example of this are shifts on x86.
6729     Value *Op2 = I->getOperand(1);
6730     if (!isa<Constant>(Op2) && PSE.getSE()->isSCEVable(Op2->getType()) &&
6731         isa<SCEVConstant>(PSE.getSCEV(Op2))) {
6732       Op2 = cast<SCEVConstant>(PSE.getSCEV(Op2))->getValue();
6733     }
6734     auto Op2Info = TTI.getOperandInfo(Op2);
6735     if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
6736         shouldConsiderInvariant(Op2))
6737       Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
6738 
6739     SmallVector<const Value *, 4> Operands(I->operand_values());
6740     return TTI.getArithmeticInstrCost(
6741         I->getOpcode(), VectorTy, CostKind,
6742         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6743         Op2Info, Operands, I, TLI);
6744   }
6745   case Instruction::FNeg: {
6746     return TTI.getArithmeticInstrCost(
6747         I->getOpcode(), VectorTy, CostKind,
6748         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6749         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6750         I->getOperand(0), I);
6751   }
6752   case Instruction::Select: {
6753     SelectInst *SI = cast<SelectInst>(I);
6754     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6755     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6756 
6757     const Value *Op0, *Op1;
6758     using namespace llvm::PatternMatch;
6759     if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
6760                         match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
6761       // select x, y, false --> x & y
6762       // select x, true, y --> x | y
6763       const auto [Op1VK, Op1VP] = TTI::getOperandInfo(Op0);
6764       const auto [Op2VK, Op2VP] = TTI::getOperandInfo(Op1);
6765       assert(Op0->getType()->getScalarSizeInBits() == 1 &&
6766               Op1->getType()->getScalarSizeInBits() == 1);
6767 
6768       SmallVector<const Value *, 2> Operands{Op0, Op1};
6769       return TTI.getArithmeticInstrCost(
6770           match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy,
6771           CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, I);
6772     }
6773 
6774     Type *CondTy = SI->getCondition()->getType();
6775     if (!ScalarCond)
6776       CondTy = VectorType::get(CondTy, VF);
6777 
6778     CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
6779     if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
6780       Pred = Cmp->getPredicate();
6781     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred,
6782                                   CostKind, {TTI::OK_AnyValue, TTI::OP_None},
6783                                   {TTI::OK_AnyValue, TTI::OP_None}, I);
6784   }
6785   case Instruction::ICmp:
6786   case Instruction::FCmp: {
6787     Type *ValTy = I->getOperand(0)->getType();
6788 
6789     if (canTruncateToMinimalBitwidth(I, VF)) {
6790       Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6791       (void)Op0AsInstruction;
6792       assert((!canTruncateToMinimalBitwidth(Op0AsInstruction, VF) ||
6793               MinBWs[I] == MinBWs[Op0AsInstruction]) &&
6794              "if both the operand and the compare are marked for "
6795              "truncation, they must have the same bitwidth");
6796       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[I]);
6797     }
6798 
6799     VectorTy = toVectorTy(ValTy, VF);
6800     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
6801                                   cast<CmpInst>(I)->getPredicate(), CostKind,
6802                                   {TTI::OK_AnyValue, TTI::OP_None},
6803                                   {TTI::OK_AnyValue, TTI::OP_None}, I);
6804   }
6805   case Instruction::Store:
6806   case Instruction::Load: {
6807     ElementCount Width = VF;
6808     if (Width.isVector()) {
6809       InstWidening Decision = getWideningDecision(I, Width);
6810       assert(Decision != CM_Unknown &&
6811              "CM decision should be taken at this point");
6812       if (getWideningCost(I, VF) == InstructionCost::getInvalid())
6813         return InstructionCost::getInvalid();
6814       if (Decision == CM_Scalarize)
6815         Width = ElementCount::getFixed(1);
6816     }
6817     VectorTy = toVectorTy(getLoadStoreType(I), Width);
6818     return getMemoryInstructionCost(I, VF);
6819   }
6820   case Instruction::BitCast:
6821     if (I->getType()->isPointerTy())
6822       return 0;
6823     [[fallthrough]];
6824   case Instruction::ZExt:
6825   case Instruction::SExt:
6826   case Instruction::FPToUI:
6827   case Instruction::FPToSI:
6828   case Instruction::FPExt:
6829   case Instruction::PtrToInt:
6830   case Instruction::IntToPtr:
6831   case Instruction::SIToFP:
6832   case Instruction::UIToFP:
6833   case Instruction::Trunc:
6834   case Instruction::FPTrunc: {
6835     // Computes the CastContextHint from a Load/Store instruction.
6836     auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
6837       assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
6838              "Expected a load or a store!");
6839 
6840       if (VF.isScalar() || !TheLoop->contains(I))
6841         return TTI::CastContextHint::Normal;
6842 
6843       switch (getWideningDecision(I, VF)) {
6844       case LoopVectorizationCostModel::CM_GatherScatter:
6845         return TTI::CastContextHint::GatherScatter;
6846       case LoopVectorizationCostModel::CM_Interleave:
6847         return TTI::CastContextHint::Interleave;
6848       case LoopVectorizationCostModel::CM_Scalarize:
6849       case LoopVectorizationCostModel::CM_Widen:
6850         return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
6851                                         : TTI::CastContextHint::Normal;
6852       case LoopVectorizationCostModel::CM_Widen_Reverse:
6853         return TTI::CastContextHint::Reversed;
6854       case LoopVectorizationCostModel::CM_Unknown:
6855         llvm_unreachable("Instr did not go through cost modelling?");
6856       case LoopVectorizationCostModel::CM_VectorCall:
6857       case LoopVectorizationCostModel::CM_IntrinsicCall:
6858         llvm_unreachable_internal("Instr has invalid widening decision");
6859       }
6860 
6861       llvm_unreachable("Unhandled case!");
6862     };
6863 
6864     unsigned Opcode = I->getOpcode();
6865     TTI::CastContextHint CCH = TTI::CastContextHint::None;
6866     // For Trunc, the context is the only user, which must be a StoreInst.
6867     if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
6868       if (I->hasOneUse())
6869         if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
6870           CCH = ComputeCCH(Store);
6871     }
6872     // For Z/Sext, the context is the operand, which must be a LoadInst.
6873     else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
6874              Opcode == Instruction::FPExt) {
6875       if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
6876         CCH = ComputeCCH(Load);
6877     }
6878 
6879     // We optimize the truncation of induction variables having constant
6880     // integer steps. The cost of these truncations is the same as the scalar
6881     // operation.
6882     if (isOptimizableIVTruncate(I, VF)) {
6883       auto *Trunc = cast<TruncInst>(I);
6884       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
6885                                   Trunc->getSrcTy(), CCH, CostKind, Trunc);
6886     }
6887 
6888     // Detect reduction patterns
6889     if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
6890       return *RedCost;
6891 
6892     Type *SrcScalarTy = I->getOperand(0)->getType();
6893     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6894     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6895       SrcScalarTy =
6896           IntegerType::get(SrcScalarTy->getContext(), MinBWs[Op0AsInstruction]);
6897     Type *SrcVecTy =
6898         VectorTy->isVectorTy() ? toVectorTy(SrcScalarTy, VF) : SrcScalarTy;
6899 
6900     if (canTruncateToMinimalBitwidth(I, VF)) {
6901       // If the result type is <= the source type, there will be no extend
6902       // after truncating the users to the minimal required bitwidth.
6903       if (VectorTy->getScalarSizeInBits() <= SrcVecTy->getScalarSizeInBits() &&
6904           (I->getOpcode() == Instruction::ZExt ||
6905            I->getOpcode() == Instruction::SExt))
6906         return 0;
6907     }
6908 
6909     return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
6910   }
6911   case Instruction::Call:
6912     return getVectorCallCost(cast<CallInst>(I), VF);
6913   case Instruction::ExtractValue:
6914     return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
6915   case Instruction::Alloca:
6916     // We cannot easily widen alloca to a scalable alloca, as
6917     // the result would need to be a vector of pointers.
6918     if (VF.isScalable())
6919       return InstructionCost::getInvalid();
6920     [[fallthrough]];
6921   default:
6922     // This opcode is unknown. Assume that it is the same as 'mul'.
6923     return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6924   } // end of switch.
6925 }
6926 
6927 void LoopVectorizationCostModel::collectValuesToIgnore() {
6928   // Ignore ephemeral values.
6929   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
6930 
6931   SmallVector<Value *, 4> DeadInterleavePointerOps;
6932   SmallVector<Value *, 4> DeadOps;
6933 
6934   // If a scalar epilogue is required, users outside the loop won't use
6935   // live-outs from the vector loop but from the scalar epilogue. Ignore them if
6936   // that is the case.
6937   bool RequiresScalarEpilogue = requiresScalarEpilogue(true);
6938   auto IsLiveOutDead = [this, RequiresScalarEpilogue](User *U) {
6939     return RequiresScalarEpilogue &&
6940            !TheLoop->contains(cast<Instruction>(U)->getParent());
6941   };
6942 
6943   LoopBlocksDFS DFS(TheLoop);
6944   DFS.perform(LI);
6945   MapVector<Value *, SmallVector<Value *>> DeadInvariantStoreOps;
6946   for (BasicBlock *BB : reverse(make_range(DFS.beginRPO(), DFS.endRPO())))
6947     for (Instruction &I : reverse(*BB)) {
6948       // Find all stores to invariant variables. Since they are going to sink
6949       // outside the loop, we do not need to calculate a cost for them.
6950       StoreInst *SI;
6951       if ((SI = dyn_cast<StoreInst>(&I)) &&
6952           Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) {
6953         ValuesToIgnore.insert(&I);
6954         DeadInvariantStoreOps[SI->getPointerOperand()].push_back(
6955             SI->getValueOperand());
6956       }
6957 
6958       if (VecValuesToIgnore.contains(&I) || ValuesToIgnore.contains(&I))
6959         continue;
6960 
6961       // Add instructions that would be trivially dead and are only used by
6962       // values already ignored to DeadOps to seed the worklist.
6963       if (wouldInstructionBeTriviallyDead(&I, TLI) &&
6964           all_of(I.users(), [this, IsLiveOutDead](User *U) {
6965             return VecValuesToIgnore.contains(U) ||
6966                    ValuesToIgnore.contains(U) || IsLiveOutDead(U);
6967           }))
6968         DeadOps.push_back(&I);
6969 
6970       // For interleave groups, we only create a pointer for the start of the
6971       // interleave group. Queue up addresses of group members except the insert
6972       // position for further processing.
6973       if (isAccessInterleaved(&I)) {
6974         auto *Group = getInterleavedAccessGroup(&I);
6975         if (Group->getInsertPos() == &I)
6976           continue;
6977         Value *PointerOp = getLoadStorePointerOperand(&I);
6978         DeadInterleavePointerOps.push_back(PointerOp);
6979       }
6980 
6981       // Queue branches for analysis. They are dead if their successors only
6982       // contain dead instructions.
6983       if (auto *Br = dyn_cast<BranchInst>(&I)) {
6984         if (Br->isConditional())
6985           DeadOps.push_back(&I);
6986       }
6987     }
6988 
6989   // Mark ops feeding interleave group members as free if they are only used
6990   // by other dead computations.
6991   for (unsigned I = 0; I != DeadInterleavePointerOps.size(); ++I) {
6992     auto *Op = dyn_cast<Instruction>(DeadInterleavePointerOps[I]);
6993     if (!Op || !TheLoop->contains(Op) || any_of(Op->users(), [this](User *U) {
6994           Instruction *UI = cast<Instruction>(U);
6995           return !VecValuesToIgnore.contains(U) &&
6996                  (!isAccessInterleaved(UI) ||
6997                   getInterleavedAccessGroup(UI)->getInsertPos() == UI);
6998         }))
6999       continue;
7000     VecValuesToIgnore.insert(Op);
7001     DeadInterleavePointerOps.append(Op->op_begin(), Op->op_end());
7002   }
7003 
7004   for (const auto &[_, Ops] : DeadInvariantStoreOps) {
7005     for (Value *Op : ArrayRef(Ops).drop_back())
7006       DeadOps.push_back(Op);
7007   }
7008   // Mark ops that would be trivially dead and are only used by ignored
7009   // instructions as free.
7010   BasicBlock *Header = TheLoop->getHeader();
7011 
7012   // Returns true if the block contains only dead instructions. Such blocks will
7013   // be removed by VPlan-to-VPlan transforms and won't be considered by the
7014   // VPlan-based cost model, so skip them in the legacy cost-model as well.
7015   auto IsEmptyBlock = [this](BasicBlock *BB) {
7016     return all_of(*BB, [this](Instruction &I) {
7017       return ValuesToIgnore.contains(&I) || VecValuesToIgnore.contains(&I) ||
7018              (isa<BranchInst>(&I) && !cast<BranchInst>(&I)->isConditional());
7019     });
7020   };
7021   for (unsigned I = 0; I != DeadOps.size(); ++I) {
7022     auto *Op = dyn_cast<Instruction>(DeadOps[I]);
7023 
7024     // Check if the branch should be considered dead.
7025     if (auto *Br = dyn_cast_or_null<BranchInst>(Op)) {
7026       BasicBlock *ThenBB = Br->getSuccessor(0);
7027       BasicBlock *ElseBB = Br->getSuccessor(1);
7028       // Don't consider branches leaving the loop for simplification.
7029       if (!TheLoop->contains(ThenBB) || !TheLoop->contains(ElseBB))
7030         continue;
7031       bool ThenEmpty = IsEmptyBlock(ThenBB);
7032       bool ElseEmpty = IsEmptyBlock(ElseBB);
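           // The branch is dead if both successors contain only dead instructions,
           // or if one successor is dead and falls through to the other successor,
           // which has no phis (a triangle).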
7033       if ((ThenEmpty && ElseEmpty) ||
7034           (ThenEmpty && ThenBB->getSingleSuccessor() == ElseBB &&
7035            ElseBB->phis().empty()) ||
7036           (ElseEmpty && ElseBB->getSingleSuccessor() == ThenBB &&
7037            ThenBB->phis().empty())) {
7038         VecValuesToIgnore.insert(Br);
7039         DeadOps.push_back(Br->getCondition());
7040       }
7041       continue;
7042     }
7043 
7044     // Skip any op that shouldn't be considered dead.
7045     if (!Op || !TheLoop->contains(Op) ||
7046         (isa<PHINode>(Op) && Op->getParent() == Header) ||
7047         !wouldInstructionBeTriviallyDead(Op, TLI) ||
7048         any_of(Op->users(), [this, IsLiveOutDead](User *U) {
7049           return !VecValuesToIgnore.contains(U) &&
7050                  !ValuesToIgnore.contains(U) && !IsLiveOutDead(U);
7051         }))
7052       continue;
7053 
7054     if (!TheLoop->contains(Op->getParent()))
7055       continue;
7056 
7057     // If all of Op's users are in ValuesToIgnore, add it to ValuesToIgnore,
7058     // which applies to both scalar and vector versions. Otherwise it is only
7059     // dead in vector versions, so only add it to VecValuesToIgnore.
7060     if (all_of(Op->users(),
7061                [this](User *U) { return ValuesToIgnore.contains(U); }))
7062       ValuesToIgnore.insert(Op);
7063 
7064     VecValuesToIgnore.insert(Op);
7065     DeadOps.append(Op->op_begin(), Op->op_end());
7066   }
7067 
7068   // Ignore type-promoting instructions we identified during reduction
7069   // detection.
7070   for (const auto &Reduction : Legal->getReductionVars()) {
7071     const RecurrenceDescriptor &RedDes = Reduction.second;
7072     const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
7073     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7074   }
7075   // Ignore type-casting instructions we identified during induction
7076   // detection.
7077   for (const auto &Induction : Legal->getInductionVars()) {
7078     const InductionDescriptor &IndDes = Induction.second;
7079     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7080     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7081   }
7082 }
7083 
7084 void LoopVectorizationCostModel::collectInLoopReductions() {
7085   for (const auto &Reduction : Legal->getReductionVars()) {
7086     PHINode *Phi = Reduction.first;
7087     const RecurrenceDescriptor &RdxDesc = Reduction.second;
7088 
7089     // We don't collect reductions that are type promoted (yet).
7090     if (RdxDesc.getRecurrenceType() != Phi->getType())
7091       continue;
7092 
7093     // If the target would prefer this reduction to happen "in-loop", then we
7094     // want to record it as such.
7095     unsigned Opcode = RdxDesc.getOpcode();
7096     if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
7097         !TTI.preferInLoopReduction(Opcode, Phi->getType(),
7098                                    TargetTransformInfo::ReductionFlags()))
7099       continue;
7100 
7101     // Check that we can correctly put the reductions into the loop, by
7102     // finding the chain of operations that leads from the phi to the loop
7103     // exit value.
7104     SmallVector<Instruction *, 4> ReductionOperations =
7105         RdxDesc.getReductionOpChain(Phi, TheLoop);
7106     bool InLoop = !ReductionOperations.empty();
7107 
7108     if (InLoop) {
7109       InLoopReductions.insert(Phi);
7110       // Add the elements to InLoopReductionImmediateChains for cost modelling.
7111       Instruction *LastChain = Phi;
7112       for (auto *I : ReductionOperations) {
7113         InLoopReductionImmediateChains[I] = LastChain;
7114         LastChain = I;
7115       }
7116     }
7117     LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
7118                       << " reduction for phi: " << *Phi << "\n");
7119   }
7120 }
7121 
7122 // This function will select a scalable VF if the target supports scalable
7123 // vectors and a fixed one otherwise.
7124 // TODO: we could return a pair of values that specify the max VF and
7125 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
7126 // `buildVPlans(VF, VF)`. We cannot do it because VPlan at the moment
7127 // doesn't have a cost model that can choose which plan to execute if
7128 // more than one is generated.
7129 static ElementCount determineVPlanVF(const TargetTransformInfo &TTI,
7130                                      LoopVectorizationCostModel &CM) {
7131   unsigned WidestType;
7132   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
7133 
7134   TargetTransformInfo::RegisterKind RegKind =
7135       TTI.enableScalableVectorization()
7136           ? TargetTransformInfo::RGK_ScalableVector
7137           : TargetTransformInfo::RGK_FixedWidthVector;
7138 
7139   TypeSize RegSize = TTI.getRegisterBitWidth(RegKind);
7140   unsigned N = RegSize.getKnownMinValue() / WidestType;
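       // For example, a scalable register of vscale x 128 bits and a widest element
       // type of 32 bits would yield VF = vscale x 4 (numbers are illustrative).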
7141   return ElementCount::get(N, RegSize.isScalable());
7142 }
7143 
7144 VectorizationFactor
7145 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
7146   ElementCount VF = UserVF;
7147   // Outer loop handling: outer loops may require CFG and instruction-level
7148   // transformations before even evaluating whether vectorization is profitable.
7149   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7150   // the vectorization pipeline.
7151   if (!OrigLoop->isInnermost()) {
7152     // If the user doesn't provide a vectorization factor, determine a
7153     // reasonable one.
7154     if (UserVF.isZero()) {
7155       VF = determineVPlanVF(TTI, CM);
7156       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
7157 
7158       // Make sure we have a VF > 1 for stress testing.
7159       if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
7160         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
7161                           << "overriding computed VF.\n");
7162         VF = ElementCount::getFixed(4);
7163       }
7164     } else if (UserVF.isScalable() && !TTI.supportsScalableVectors() &&
7165                !ForceTargetSupportsScalableVectors) {
7166       LLVM_DEBUG(dbgs() << "LV: Not vectorizing. Scalable VF requested, but "
7167                         << "not supported by the target.\n");
7168       reportVectorizationFailure(
7169           "Scalable vectorization requested but not supported by the target",
7170           "the scalable user-specified vectorization width for outer-loop "
7171           "vectorization cannot be used because the target does not support "
7172           "scalable vectors.",
7173           "ScalableVFUnfeasible", ORE, OrigLoop);
7174       return VectorizationFactor::Disabled();
7175     }
7176     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7177     assert(isPowerOf2_32(VF.getKnownMinValue()) &&
7178            "VF needs to be a power of two");
7179     LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
7180                       << "VF " << VF << " to build VPlans.\n");
7181     buildVPlans(VF, VF);
7182 
7183     // For VPlan build stress testing, we bail out after VPlan construction.
7184     if (VPlanBuildStressTest)
7185       return VectorizationFactor::Disabled();
7186 
7187     return {VF, 0 /*Cost*/, 0 /* ScalarCost */};
7188   }
7189 
7190   LLVM_DEBUG(
7191       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
7192                 "VPlan-native path.\n");
7193   return VectorizationFactor::Disabled();
7194 }
7195 
7196 void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
7197   assert(OrigLoop->isInnermost() && "Inner loop expected.");
7198   CM.collectValuesToIgnore();
7199   CM.collectElementTypesForWidening();
7200 
7201   FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
7202   if (!MaxFactors) // Cases that should not be vectorized nor interleaved.
7203     return;
7204 
7205   // Invalidate interleave groups if all blocks of the loop will be predicated.
7206   if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
7207       !useMaskedInterleavedAccesses(TTI)) {
7208     LLVM_DEBUG(
7209         dbgs()
7210         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
7211            "which requires masked-interleaved support.\n");
7212     if (CM.InterleaveInfo.invalidateGroups())
7213       // Invalidating interleave groups also requires invalidating all decisions
7214       // based on them, which includes widening decisions and uniform and scalar
7215       // values.
7216       CM.invalidateCostModelingDecisions();
7217   }
7218 
7219   if (CM.foldTailByMasking())
7220     Legal->prepareToFoldTailByMasking();
7221 
7222   ElementCount MaxUserVF =
7223       UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
7224   if (UserVF) {
7225     if (!ElementCount::isKnownLE(UserVF, MaxUserVF)) {
7226       reportVectorizationInfo(
7227           "UserVF ignored because it may be larger than the maximal safe VF",
7228           "InvalidUserVF", ORE, OrigLoop);
7229     } else {
7230       assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
7231              "VF needs to be a power of two");
7232       // Collect the instructions (and their associated costs) that will be more
7233       // profitable to scalarize.
7234       CM.collectInLoopReductions();
7235       if (CM.selectUserVectorizationFactor(UserVF)) {
7236         LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
7237         buildVPlansWithVPRecipes(UserVF, UserVF);
7238         LLVM_DEBUG(printPlans(dbgs()));
7239         return;
7240       }
7241       reportVectorizationInfo("UserVF ignored because of invalid costs.",
7242                               "InvalidCost", ORE, OrigLoop);
7243     }
7244   }
7245 
7246   // Collect the Vectorization Factor Candidates.
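       // As a concrete illustration (example values, not from this source): with
       // MaxFactors.FixedVF = 16 and MaxFactors.ScalableVF = vscale x 4, the loops
       // below collect {1, 2, 4, 8, 16} and {vscale x 1, vscale x 2, vscale x 4}
       // as candidates.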
7247   SmallVector<ElementCount> VFCandidates;
7248   for (auto VF = ElementCount::getFixed(1);
7249        ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
7250     VFCandidates.push_back(VF);
7251   for (auto VF = ElementCount::getScalable(1);
7252        ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
7253     VFCandidates.push_back(VF);
7254 
7255   CM.collectInLoopReductions();
7256   for (const auto &VF : VFCandidates) {
7257     // Collect Uniform and Scalar instructions after vectorization with VF.
7258     CM.collectUniformsAndScalars(VF);
7259 
7260     // Collect the instructions (and their associated costs) that will be more
7261     // profitable to scalarize.
7262     if (VF.isVector())
7263       CM.collectInstsToScalarize(VF);
7264   }
7265 
7266   buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
7267   buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
7268 
7269   LLVM_DEBUG(printPlans(dbgs()));
7270 }
7271 
7272 InstructionCost VPCostContext::getLegacyCost(Instruction *UI,
7273                                              ElementCount VF) const {
7274   if (ForceTargetInstructionCost.getNumOccurrences())
7275     return InstructionCost(ForceTargetInstructionCost);
7276   return CM.getInstructionCost(UI, VF);
7277 }
7278 
7279 bool VPCostContext::skipCostComputation(Instruction *UI, bool IsVector) const {
7280   return CM.ValuesToIgnore.contains(UI) ||
7281          (IsVector && CM.VecValuesToIgnore.contains(UI)) ||
7282          SkipCostComputation.contains(UI);
7283 }
7284 
7285 InstructionCost
7286 LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
7287                                           VPCostContext &CostCtx) const {
7288   InstructionCost Cost;
7289   // Cost modeling for inductions is inaccurate in the legacy cost model
7290   // compared to the recipes that are generated. To match the legacy model
7291   // initially during the VPlan cost-model bring-up, directly use its induction
7292   // costs here. Note that we do this as pre-processing; the VPlan may not have
7293   // any recipes associated with the original induction increment instruction and
7294   // may replace truncates with VPWidenIntOrFpInductionRecipe. We precompute the
7295   // cost of induction phis and increments (both those that are represented by
7296   // recipes and those that are not) to avoid distinguishing between them here,
7297   // and later skip all recipes that represent induction phis and increments (the
7298   // former case), if they exist, to avoid counting them twice. Similarly, we
7299   // pre-compute the cost of any optimized truncates.
7300   // TODO: Switch to more accurate costing based on VPlan.
7301   for (const auto &[IV, IndDesc] : Legal->getInductionVars()) {
7302     Instruction *IVInc = cast<Instruction>(
7303         IV->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
7304     SmallVector<Instruction *> IVInsts = {IVInc};
7305     for (unsigned I = 0; I != IVInsts.size(); I++) {
7306       for (Value *Op : IVInsts[I]->operands()) {
7307         auto *OpI = dyn_cast<Instruction>(Op);
7308         if (Op == IV || !OpI || !OrigLoop->contains(OpI) || !Op->hasOneUse())
7309           continue;
7310         IVInsts.push_back(OpI);
7311       }
7312     }
7313     IVInsts.push_back(IV);
7314     for (User *U : IV->users()) {
7315       auto *CI = cast<Instruction>(U);
7316       if (!CostCtx.CM.isOptimizableIVTruncate(CI, VF))
7317         continue;
7318       IVInsts.push_back(CI);
7319     }
7320 
7321     // If the vector loop gets executed exactly once with the given VF, ignore
7322     // the costs of comparison and induction instructions, as they'll get
7323     // simplified away.
7324     // TODO: Remove this code after stepping away from the legacy cost model and
7325     // adding code to simplify VPlans before calculating their costs.
7326     auto TC = PSE.getSE()->getSmallConstantTripCount(OrigLoop);
7327     if (VF.isFixed() && TC == VF.getFixedValue() && !CM.foldTailByMasking())
7328       addFullyUnrolledInstructionsToIgnore(OrigLoop, Legal->getInductionVars(),
7329                                            CostCtx.SkipCostComputation);
7330 
7331     for (Instruction *IVInst : IVInsts) {
7332       if (CostCtx.skipCostComputation(IVInst, VF.isVector()))
7333         continue;
7334       InstructionCost InductionCost = CostCtx.getLegacyCost(IVInst, VF);
7335       LLVM_DEBUG({
7336         dbgs() << "Cost of " << InductionCost << " for VF " << VF
7337                << ": induction instruction " << *IVInst << "\n";
7338       });
7339       Cost += InductionCost;
7340       CostCtx.SkipCostComputation.insert(IVInst);
7341     }
7342   }
7343 
7344   // Compute the cost of all exiting conditions of the loop using the legacy
7345   // cost model. This is to match the legacy behavior, which adds the cost of
7346   // all exit conditions. Note that this over-estimates the cost, as there will
7347   // be a single condition to control the vector loop.
7348   SmallVector<BasicBlock *> Exiting;
7349   CM.TheLoop->getExitingBlocks(Exiting);
7350   SetVector<Instruction *> ExitInstrs;
7351   // Collect all exit conditions.
7352   for (BasicBlock *EB : Exiting) {
7353     auto *Term = dyn_cast<BranchInst>(EB->getTerminator());
7354     if (!Term)
7355       continue;
7356     if (auto *CondI = dyn_cast<Instruction>(Term->getOperand(0))) {
7357       ExitInstrs.insert(CondI);
7358     }
7359   }
7360   // Compute the cost of all instructions only feeding the exit conditions.
7361   for (unsigned I = 0; I != ExitInstrs.size(); ++I) {
7362     Instruction *CondI = ExitInstrs[I];
7363     if (!OrigLoop->contains(CondI) ||
7364         !CostCtx.SkipCostComputation.insert(CondI).second)
7365       continue;
7366     InstructionCost CondICost = CostCtx.getLegacyCost(CondI, VF);
7367     LLVM_DEBUG({
7368       dbgs() << "Cost of " << CondICost << " for VF " << VF
7369              << ": exit condition instruction " << *CondI << "\n";
7370     });
7371     Cost += CondICost;
7372     for (Value *Op : CondI->operands()) {
7373       auto *OpI = dyn_cast<Instruction>(Op);
7374       if (!OpI || any_of(OpI->users(), [&ExitInstrs, this](User *U) {
7375             return OrigLoop->contains(cast<Instruction>(U)->getParent()) &&
7376                    !ExitInstrs.contains(cast<Instruction>(U));
7377           }))
7378         continue;
7379       ExitInstrs.insert(OpI);
7380     }
7381   }
7382 
7383   // The legacy cost model has special logic to compute the cost of in-loop
7384   // reductions, which may be smaller than the sum of all instructions involved
7385   // in the reduction.
7386   // TODO: Switch to costing based on VPlan once the logic has been ported.
7387   for (const auto &[RedPhi, RdxDesc] : Legal->getReductionVars()) {
7388     if (ForceTargetInstructionCost.getNumOccurrences())
7389       continue;
7390 
7391     if (!CM.isInLoopReduction(RedPhi))
7392       continue;
7393 
7394     const auto &ChainOps = RdxDesc.getReductionOpChain(RedPhi, OrigLoop);
7395     SetVector<Instruction *> ChainOpsAndOperands(ChainOps.begin(),
7396                                                  ChainOps.end());
7397     auto IsZExtOrSExt = [](const unsigned Opcode) -> bool {
7398       return Opcode == Instruction::ZExt || Opcode == Instruction::SExt;
7399     };
7400     // Also include the operands of instructions in the chain, as the cost-model
7401     // may mark extends as free.
7402     //
7403   // For ARM, some of the instructions can be folded into the reduction
7404   // instruction, so we need to mark all folded instructions free.
7405     // For example: We can fold reduce(mul(ext(A), ext(B))) into one
7406     // instruction.
7407     for (auto *ChainOp : ChainOps) {
7408       for (Value *Op : ChainOp->operands()) {
7409         if (auto *I = dyn_cast<Instruction>(Op)) {
7410           ChainOpsAndOperands.insert(I);
7411           if (I->getOpcode() == Instruction::Mul) {
7412             auto *Ext0 = dyn_cast<Instruction>(I->getOperand(0));
7413             auto *Ext1 = dyn_cast<Instruction>(I->getOperand(1));
7414             if (Ext0 && IsZExtOrSExt(Ext0->getOpcode()) && Ext1 &&
7415                 Ext0->getOpcode() == Ext1->getOpcode()) {
7416               ChainOpsAndOperands.insert(Ext0);
7417               ChainOpsAndOperands.insert(Ext1);
7418             }
7419           }
7420         }
7421       }
7422     }
7423 
7424     // Pre-compute the cost for I, if it has a reduction pattern cost.
7425     for (Instruction *I : ChainOpsAndOperands) {
7426       auto ReductionCost = CM.getReductionPatternCost(
7427           I, VF, toVectorTy(I->getType(), VF), TTI::TCK_RecipThroughput);
7428       if (!ReductionCost)
7429         continue;
7430 
7431       assert(!CostCtx.SkipCostComputation.contains(I) &&
7432              "reduction op visited multiple times");
7433       CostCtx.SkipCostComputation.insert(I);
7434       LLVM_DEBUG(dbgs() << "Cost of " << ReductionCost << " for VF " << VF
7435                         << ":\n in-loop reduction " << *I << "\n");
7436       Cost += *ReductionCost;
7437     }
7438   }
7439 
7440   // Pre-compute the costs for branches except for the backedge, as the number
7441   // of replicate regions in a VPlan may not directly match the number of
7442   // branches, which would lead to different decisions.
7443   // TODO: Compute cost of branches for each replicate region in the VPlan,
7444   // which is more accurate than the legacy cost model.
7445   for (BasicBlock *BB : OrigLoop->blocks()) {
7446     if (CostCtx.skipCostComputation(BB->getTerminator(), VF.isVector()))
7447       continue;
7448     CostCtx.SkipCostComputation.insert(BB->getTerminator());
7449     if (BB == OrigLoop->getLoopLatch())
7450       continue;
7451     auto BranchCost = CostCtx.getLegacyCost(BB->getTerminator(), VF);
7452     Cost += BranchCost;
7453   }
7454 
7455   // Pre-compute costs for instructions that are forced-scalar or profitable to
7456   // scalarize. Their costs will be computed separately in the legacy cost
7457   // model.
7458   for (Instruction *ForcedScalar : CM.ForcedScalars[VF]) {
7459     if (CostCtx.skipCostComputation(ForcedScalar, VF.isVector()))
7460       continue;
7461     CostCtx.SkipCostComputation.insert(ForcedScalar);
7462     InstructionCost ForcedCost = CostCtx.getLegacyCost(ForcedScalar, VF);
7463     LLVM_DEBUG({
7464       dbgs() << "Cost of " << ForcedCost << " for VF " << VF
7465              << ": forced scalar " << *ForcedScalar << "\n";
7466     });
7467     Cost += ForcedCost;
7468   }
7469   for (const auto &[Scalarized, ScalarCost] : CM.InstsToScalarize[VF]) {
7470     if (CostCtx.skipCostComputation(Scalarized, VF.isVector()))
7471       continue;
7472     CostCtx.SkipCostComputation.insert(Scalarized);
7473     LLVM_DEBUG({
7474       dbgs() << "Cost of " << ScalarCost << " for VF " << VF
7475              << ": profitable to scalarize " << *Scalarized << "\n";
7476     });
7477     Cost += ScalarCost;
7478   }
7479 
7480   return Cost;
7481 }
7482 
7483 InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
7484                                                ElementCount VF) const {
7485   VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM);
7486   InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx);
7487 
7488   // Now compute and add the VPlan-based cost.
7489   Cost += Plan.cost(VF, CostCtx);
7490 #ifndef NDEBUG
7491   unsigned EstimatedWidth = getEstimatedRuntimeVF(OrigLoop, CM.TTI, VF);
7492   LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost
7493                     << " (Estimated cost per lane: ");
7494   if (Cost.isValid()) {
7495     double CostPerLane = double(*Cost.getValue()) / EstimatedWidth;
7496     LLVM_DEBUG(dbgs() << format("%.1f", CostPerLane));
7497   } else /* No point dividing an invalid cost - it will still be invalid */
7498     LLVM_DEBUG(dbgs() << "Invalid");
7499   LLVM_DEBUG(dbgs() << ")\n");
7500 #endif
7501   return Cost;
7502 }
7503 
7504 #ifndef NDEBUG
7505 /// Return true if the original loop \p TheLoop contains any instructions that
7506 /// do not have corresponding recipes in \p Plan and are not marked to be
7507 /// ignored in \p CostCtx. This means the VPlan contains simplifications that
7508 /// the legacy cost-model did not account for.
7509 static bool planContainsAdditionalSimplifications(VPlan &Plan,
7510                                                   VPCostContext &CostCtx,
7511                                                   Loop *TheLoop) {
7512   // First collect all instructions for the recipes in Plan.
7513   auto GetInstructionForCost = [](const VPRecipeBase *R) -> Instruction * {
7514     if (auto *S = dyn_cast<VPSingleDefRecipe>(R))
7515       return dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
7516     if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(R))
7517       return &WidenMem->getIngredient();
7518     return nullptr;
7519   };
7520 
7521   DenseSet<Instruction *> SeenInstrs;
7522   auto Iter = vp_depth_first_deep(Plan.getVectorLoopRegion()->getEntry());
7523   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
7524     for (VPRecipeBase &R : *VPBB) {
7525       if (auto *IR = dyn_cast<VPInterleaveRecipe>(&R)) {
7526         auto *IG = IR->getInterleaveGroup();
7527         unsigned NumMembers = IG->getNumMembers();
7528         for (unsigned I = 0; I != NumMembers; ++I) {
7529           if (Instruction *M = IG->getMember(I))
7530             SeenInstrs.insert(M);
7531         }
7532         continue;
7533       }
7534       // The VPlan-based cost model is more accurate for partial reduction and
7535       // comparing against the legacy cost isn't desirable.
7536       if (isa<VPPartialReductionRecipe>(&R))
7537         return true;
7538       if (Instruction *UI = GetInstructionForCost(&R))
7539         SeenInstrs.insert(UI);
7540     }
7541   }
7542 
7543   // Return true if the loop contains any instructions that are not also part of
7544   // the VPlan or are skipped for VPlan-based cost computations. This indicates
7545   // that the VPlan contains extra simplifications.
7546   return any_of(TheLoop->blocks(), [&SeenInstrs, &CostCtx,
7547                                     TheLoop](BasicBlock *BB) {
7548     return any_of(*BB, [&SeenInstrs, &CostCtx, TheLoop, BB](Instruction &I) {
7549       if (isa<PHINode>(&I) && BB == TheLoop->getHeader())
7550         return false;
7551       return !SeenInstrs.contains(&I) && !CostCtx.skipCostComputation(&I, true);
7552     });
7553   });
7554 }
7555 #endif
7556 
7557 VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
7558   if (VPlans.empty())
7559     return VectorizationFactor::Disabled();
7560   // If there is a single VPlan with a single VF, return it directly.
7561   VPlan &FirstPlan = *VPlans[0];
7562   if (VPlans.size() == 1 && size(FirstPlan.vectorFactors()) == 1)
7563     return {*FirstPlan.vectorFactors().begin(), 0, 0};
7564 
7565   ElementCount ScalarVF = ElementCount::getFixed(1);
7566   assert(hasPlanWithVF(ScalarVF) &&
7567          "More than a single plan/VF w/o any plan having scalar VF");
7568 
7569   // TODO: Compute scalar cost using VPlan-based cost model.
7570   InstructionCost ScalarCost = CM.expectedCost(ScalarVF);
7571   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ScalarCost << ".\n");
7572   VectorizationFactor ScalarFactor(ScalarVF, ScalarCost, ScalarCost);
7573   VectorizationFactor BestFactor = ScalarFactor;
7574 
7575   bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
7576   if (ForceVectorization) {
7577     // Ignore scalar width, because the user explicitly wants vectorization.
7578     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
7579     // evaluation.
7580     BestFactor.Cost = InstructionCost::getMax();
7581   }
7582 
7583   for (auto &P : VPlans) {
7584     for (ElementCount VF : P->vectorFactors()) {
7585       if (VF.isScalar())
7586         continue;
7587       if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
7588         LLVM_DEBUG(
7589             dbgs()
7590             << "LV: Not considering vector loop of width " << VF
7591             << " because it will not generate any vector instructions.\n");
7592         continue;
7593       }
7594 
7595       InstructionCost Cost = cost(*P, VF);
7596       VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);
7597       if (isMoreProfitable(CurrentFactor, BestFactor))
7598         BestFactor = CurrentFactor;
7599 
7600       // If profitable, add it to the ProfitableVFs list.
7601       if (isMoreProfitable(CurrentFactor, ScalarFactor))
7602         ProfitableVFs.push_back(CurrentFactor);
7603     }
7604   }
7605 
7606 #ifndef NDEBUG
7607   // Select the optimal vectorization factor according to the legacy cost-model.
7608   // This is now only used to verify the decisions by the new VPlan-based
7609   // cost-model and will be retired once the VPlan-based cost-model is
7610   // stabilized.
7611   VectorizationFactor LegacyVF = selectVectorizationFactor();
7612   VPlan &BestPlan = getPlanFor(BestFactor.Width);
7613 
7614   // Pre-compute the cost and use it to check if BestPlan contains any
7615   // simplifications not accounted for in the legacy cost model. If that's the
7616   // case, don't trigger the assertion, as the extra simplifications may cause a
7617   // different VF to be picked by the VPlan-based cost model.
7618   VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM);
7619   precomputeCosts(BestPlan, BestFactor.Width, CostCtx);
7620   assert((BestFactor.Width == LegacyVF.Width ||
7621           planContainsAdditionalSimplifications(getPlanFor(BestFactor.Width),
7622                                                 CostCtx, OrigLoop) ||
7623           planContainsAdditionalSimplifications(getPlanFor(LegacyVF.Width),
7624                                                 CostCtx, OrigLoop)) &&
7625          " VPlan cost model and legacy cost model disagreed");
7626   assert((BestFactor.Width.isScalar() || BestFactor.ScalarCost > 0) &&
7627          "when vectorizing, the scalar cost must be computed.");
7628 #endif
7629 
7630   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << BestFactor.Width << ".\n");
7631   return BestFactor;
7632 }
7633 
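     // Illustrative sketch of the result (hand-written, not generated output):
     // for a loop with no prior unroll metadata, the new loop ID roughly becomes
     //   !0 = distinct !{!0, !1}
     //   !1 = !{!"llvm.loop.unroll.runtime.disable"}
     // i.e. a self-referential node whose only other operand disables runtime
     // unrolling.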
7634 static void addRuntimeUnrollDisableMetaData(Loop *L) {
7635   SmallVector<Metadata *, 4> MDs;
7636   // Reserve first location for self reference to the LoopID metadata node.
7637   MDs.push_back(nullptr);
7638   bool IsUnrollMetadata = false;
7639   MDNode *LoopID = L->getLoopID();
7640   if (LoopID) {
7641     // First find existing loop unrolling disable metadata.
7642     for (unsigned I = 1, IE = LoopID->getNumOperands(); I < IE; ++I) {
7643       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(I));
7644       if (MD) {
7645         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
7646         IsUnrollMetadata =
7647             S && S->getString().starts_with("llvm.loop.unroll.disable");
7648       }
7649       MDs.push_back(LoopID->getOperand(I));
7650     }
7651   }
7652 
7653   if (!IsUnrollMetadata) {
7654     // Add runtime unroll disable metadata.
7655     LLVMContext &Context = L->getHeader()->getContext();
7656     SmallVector<Metadata *, 1> DisableOperands;
7657     DisableOperands.push_back(
7658         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7659     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7660     MDs.push_back(DisableNode);
7661     MDNode *NewLoopID = MDNode::get(Context, MDs);
7662     // Set operand 0 to refer to the loop id itself.
7663     NewLoopID->replaceOperandWith(0, NewLoopID);
7664     L->setLoopID(NewLoopID);
7665   }
7666 }
7667 
7668 // If \p R is a ComputeReductionResult when vectorizing the epilog loop,
7669 // fix the reduction's scalar PHI node by adding the incoming value from the
7670 // main vector loop.
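     // For AnyOf and FindLastIV reductions, the start value of the epilogue header
     // phi is expected to wrap the resume phi created after the main vector loop.
     // As an illustration (hypothetical value names and types), for AnyOf this is
     //   %cmp = icmp ne i32 %bc.merge.rdx, %rdx.start
     // and for FindLastIV
     //   %cmp = icmp eq i32 %bc.merge.rdx, %rdx.start
     //   %sel = select i1 %cmp, i32 <sentinel>, i32 %bc.merge.rdx
     // from which the original resume phi (%bc.merge.rdx) is recovered below.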
7671 static void fixReductionScalarResumeWhenVectorizingEpilog(
7672     VPRecipeBase *R, VPTransformState &State, BasicBlock *LoopMiddleBlock,
7673     BasicBlock *BypassBlock) {
7674   auto *EpiRedResult = dyn_cast<VPInstruction>(R);
7675   if (!EpiRedResult ||
7676       EpiRedResult->getOpcode() != VPInstruction::ComputeReductionResult)
7677     return;
7678 
7679   auto *EpiRedHeaderPhi =
7680       cast<VPReductionPHIRecipe>(EpiRedResult->getOperand(0));
7681   const RecurrenceDescriptor &RdxDesc =
7682       EpiRedHeaderPhi->getRecurrenceDescriptor();
7683   Value *MainResumeValue =
7684       EpiRedHeaderPhi->getStartValue()->getUnderlyingValue();
7685   if (RecurrenceDescriptor::isAnyOfRecurrenceKind(
7686           RdxDesc.getRecurrenceKind())) {
7687     auto *Cmp = cast<ICmpInst>(MainResumeValue);
7688     assert(Cmp->getPredicate() == CmpInst::ICMP_NE &&
7689            "AnyOf expected to start with ICMP_NE");
7690     assert(Cmp->getOperand(1) == RdxDesc.getRecurrenceStartValue() &&
7691            "AnyOf expected to start by comparing main resume value to original "
7692            "start value");
7693     MainResumeValue = Cmp->getOperand(0);
7694   } else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(
7695                  RdxDesc.getRecurrenceKind())) {
7696     using namespace llvm::PatternMatch;
7697     Value *Cmp, *OrigResumeV;
7698     bool IsExpectedPattern =
7699         match(MainResumeValue, m_Select(m_OneUse(m_Value(Cmp)),
7700                                         m_Specific(RdxDesc.getSentinelValue()),
7701                                         m_Value(OrigResumeV))) &&
7702         match(Cmp,
7703               m_SpecificICmp(ICmpInst::ICMP_EQ, m_Specific(OrigResumeV),
7704                              m_Specific(RdxDesc.getRecurrenceStartValue())));
7705     assert(IsExpectedPattern && "Unexpected reduction resume pattern");
7706     (void)IsExpectedPattern;
7707     MainResumeValue = OrigResumeV;
7708   }
7709   PHINode *MainResumePhi = cast<PHINode>(MainResumeValue);
7710 
7711   // When fixing reductions in the epilogue loop we should already have
7712   // created a bc.merge.rdx Phi after the main vector body. Ensure that we carry
7713   // over the incoming values correctly.
7714   using namespace VPlanPatternMatch;
7715   auto IsResumePhi = [](VPUser *U) {
7716     return match(
7717         U, m_VPInstruction<VPInstruction::ResumePhi>(m_VPValue(), m_VPValue()));
7718   };
7719   assert(count_if(EpiRedResult->users(), IsResumePhi) == 1 &&
7720          "ResumePhi must have a single user");
7721   auto *EpiResumePhiVPI =
7722       cast<VPInstruction>(*find_if(EpiRedResult->users(), IsResumePhi));
7723   auto *EpiResumePhi = cast<PHINode>(State.get(EpiResumePhiVPI, true));
7724   EpiResumePhi->setIncomingValueForBlock(
7725       BypassBlock, MainResumePhi->getIncomingValueForBlock(BypassBlock));
7726 }
7727 
7728 DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
7729     ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan,
7730     InnerLoopVectorizer &ILV, DominatorTree *DT, bool VectorizingEpilogue,
7731     const DenseMap<const SCEV *, Value *> *ExpandedSCEVs) {
7732   assert(BestVPlan.hasVF(BestVF) &&
7733          "Trying to execute plan with unsupported VF");
7734   assert(BestVPlan.hasUF(BestUF) &&
7735          "Trying to execute plan with unsupported UF");
7736   assert(
7737       ((VectorizingEpilogue && ExpandedSCEVs) ||
7738        (!VectorizingEpilogue && !ExpandedSCEVs)) &&
7739       "expanded SCEVs to reuse can only be used during epilogue vectorization");
7740 
7741   // TODO: Move to VPlan transform stage once the transition to the VPlan-based
7742   // cost model is complete for better cost estimates.
7743   VPlanTransforms::unrollByUF(BestVPlan, BestUF,
7744                               OrigLoop->getHeader()->getContext());
7745   VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
7746   VPlanTransforms::convertToConcreteRecipes(BestVPlan);
7747 
7748   // Perform the actual loop transformation.
7749   VPTransformState State(&TTI, BestVF, BestUF, LI, DT, ILV.Builder, &ILV,
7750                          &BestVPlan, OrigLoop->getParentLoop(),
7751                          Legal->getWidestInductionType());
7752 
7753 #ifdef EXPENSIVE_CHECKS
7754   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
7755 #endif
7756 
7757   // 0. Generate SCEV-dependent code in the entry, including TripCount, before
7758   // making any changes to the CFG.
7759   if (!BestVPlan.getEntry()->empty())
7760     BestVPlan.getEntry()->execute(&State);
7761 
7762   if (!ILV.getTripCount())
7763     ILV.setTripCount(State.get(BestVPlan.getTripCount(), VPLane(0)));
7764   else
7765     assert(VectorizingEpilogue && "should only re-use the existing trip "
7766                                   "count during epilogue vectorization");
7767 
7768   // 1. Set up the skeleton for vectorization, including vector pre-header and
7769   // middle block. The vector loop is created during VPlan execution.
7770   VPBasicBlock *VectorPH =
7771       cast<VPBasicBlock>(BestVPlan.getEntry()->getSingleSuccessor());
7772   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(
7773       ExpandedSCEVs ? *ExpandedSCEVs : State.ExpandedSCEVs);
7774   if (VectorizingEpilogue)
7775     VPlanTransforms::removeDeadRecipes(BestVPlan);
7776 
7777   // Only use noalias metadata when using memory checks guaranteeing no overlap
7778   // across all iterations.
7779   const LoopAccessInfo *LAI = ILV.Legal->getLAI();
7780   std::unique_ptr<LoopVersioning> LVer = nullptr;
7781   if (LAI && !LAI->getRuntimePointerChecking()->getChecks().empty() &&
7782       !LAI->getRuntimePointerChecking()->getDiffChecks()) {
7783 
7784     //  We currently don't use LoopVersioning for the actual loop cloning but we
7785     //  still use it to add the noalias metadata.
7786     //  TODO: Find a better way to re-use LoopVersioning functionality to add
7787     //        metadata.
7788     LVer = std::make_unique<LoopVersioning>(
7789         *LAI, LAI->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, DT,
7790         PSE.getSE());
7791     State.LVer = &*LVer;
7792     State.LVer->prepareNoAliasMetadata();
7793   }
7794 
7795   ILV.printDebugTracesAtStart();
7796 
7797   //===------------------------------------------------===//
7798   //
7799   // Notice: any optimization or new instruction that go
7800   // into the code below should also be implemented in
7801   // the cost-model.
7802   //
7803   //===------------------------------------------------===//
7804 
7805   // 2. Copy and widen instructions from the old loop into the new loop.
7806   BestVPlan.prepareToExecute(
7807       ILV.getTripCount(),
7808       ILV.getOrCreateVectorTripCount(ILV.LoopVectorPreHeader), State);
7809   replaceVPBBWithIRVPBB(VectorPH, State.CFG.PrevBB);
7810 
7811   BestVPlan.execute(&State);
7812 
7813   auto *MiddleVPBB = BestVPlan.getMiddleBlock();
7814   // 2.5 When vectorizing the epilogue, fix reduction and induction resume
7815   // values from the additional bypass block.
7816   if (VectorizingEpilogue) {
7817     assert(!ILV.Legal->hasUncountableEarlyExit() &&
7818            "Epilogue vectorisation not yet supported with early exits");
7819     BasicBlock *BypassBlock = ILV.getAdditionalBypassBlock();
7820     for (VPRecipeBase &R : *MiddleVPBB) {
7821       fixReductionScalarResumeWhenVectorizingEpilog(
7822           &R, State, State.CFG.VPBB2IRBB[MiddleVPBB], BypassBlock);
7823     }
7824     BasicBlock *PH = OrigLoop->getLoopPreheader();
7825     for (const auto &[IVPhi, _] : Legal->getInductionVars()) {
7826       auto *Inc = cast<PHINode>(IVPhi->getIncomingValueForBlock(PH));
7827       Value *V = ILV.getInductionAdditionalBypassValue(IVPhi);
7828       Inc->setIncomingValueForBlock(BypassBlock, V);
7829     }
7830   }
7831 
7832   // 2.6. Maintain Loop Hints
7833   // Keep all loop hints from the original loop on the vector loop (we'll
7834   // replace the vectorizer-specific hints below).
7835   if (auto *LoopRegion = BestVPlan.getVectorLoopRegion()) {
7836     MDNode *OrigLoopID = OrigLoop->getLoopID();
7837 
7838     std::optional<MDNode *> VectorizedLoopID =
7839         makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7840                                         LLVMLoopVectorizeFollowupVectorized});
7841 
7842     VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
7843     Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]);
7844     if (VectorizedLoopID) {
7845       L->setLoopID(*VectorizedLoopID);
7846     } else {
7847       // Keep all loop hints from the original loop on the vector loop (we'll
7848       // replace the vectorizer-specific hints below).
7849       if (MDNode *LID = OrigLoop->getLoopID())
7850         L->setLoopID(LID);
7851 
7852       LoopVectorizeHints Hints(L, true, *ORE);
7853       Hints.setAlreadyVectorized();
7854     }
7855     TargetTransformInfo::UnrollingPreferences UP;
7856     TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE);
7857     if (!UP.UnrollVectorizedLoop || VectorizingEpilogue)
7858       addRuntimeUnrollDisableMetaData(L);
7859   }
7860 
7861   // 3. Fix the vectorized code: take care of header phi's, live-outs,
7862   //    predication, updating analyses.
7863   ILV.fixVectorizedLoop(State);
7864 
7865   ILV.printDebugTracesAtEnd();
7866 
7867   // 4. Adjust branch weight of the branch in the middle block.
7868   if (BestVPlan.getVectorLoopRegion()) {
7869     auto *MiddleVPBB = BestVPlan.getMiddleBlock();
7870     auto *MiddleTerm =
7871         cast<BranchInst>(State.CFG.VPBB2IRBB[MiddleVPBB]->getTerminator());
7872     if (MiddleTerm->isConditional() &&
7873         hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
7874       // Assume that `Count % VectorTripCount` is equally distributed.
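           // For example (illustrative numbers): with UF = 2 and a fixed VF = 4,
           // TripCount below is 8 and the branch weights become {1, 7}.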
7875       unsigned TripCount = BestVPlan.getUF() * State.VF.getKnownMinValue();
7876       assert(TripCount > 0 && "trip count should not be zero");
7877       const uint32_t Weights[] = {1, TripCount - 1};
7878       setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false);
7879     }
7880   }
7881 
7882   return State.ExpandedSCEVs;
7883 }
7884 
7885 //===--------------------------------------------------------------------===//
7886 // EpilogueVectorizerMainLoop
7887 //===--------------------------------------------------------------------===//
7888 
7889 /// This function is partially responsible for generating the control flow
7890 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7891 BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton(
7892     const SCEV2ValueTy &ExpandedSCEVs) {
7893   createVectorLoopSkeleton("");
7894 
7895   // Generate the code to check the minimum iteration count of the vector
7896   // epilogue (see below).
7897   EPI.EpilogueIterationCountCheck =
7898       emitIterationCountCheck(LoopScalarPreHeader, true);
7899   EPI.EpilogueIterationCountCheck->setName("iter.check");
7900 
7901   // Generate the code to check any assumptions that we've made for SCEV
7902   // expressions.
7903   EPI.SCEVSafetyCheck = emitSCEVChecks(LoopScalarPreHeader);
7904 
7905   // Generate the code that checks at runtime if arrays overlap. We put the
7906   // checks into a separate block to make the more common case of few elements
7907   // faster.
7908   EPI.MemSafetyCheck = emitMemRuntimeChecks(LoopScalarPreHeader);
7909 
7910   // Generate the iteration count check for the main loop, *after* the check
7911   // for the epilogue loop, so that the path-length is shorter for the case
7912   // that goes directly through the vector epilogue. The longer-path length for
7913   // the main loop is compensated for by the gain from vectorizing the larger
7914   // trip count. Note: the branch will get updated later on when we vectorize
7915   // the epilogue.
7916   EPI.MainLoopIterationCountCheck =
7917       emitIterationCountCheck(LoopScalarPreHeader, false);
7918 
7919   // Generate the induction variable.
7920   EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
7921 
7922   return LoopVectorPreHeader;
7923 }
7924 
7925 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
7926   LLVM_DEBUG({
7927     dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7928            << "Main Loop VF:" << EPI.MainLoopVF
7929            << ", Main Loop UF:" << EPI.MainLoopUF
7930            << ", Epilogue Loop VF:" << EPI.EpilogueVF
7931            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7932   });
7933 }
7934 
7935 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
7936   DEBUG_WITH_TYPE(VerboseDebug, {
7937     dbgs() << "intermediate fn:\n"
7938            << *OrigLoop->getHeader()->getParent() << "\n";
7939   });
7940 }
7941 
7942 BasicBlock *
7943 EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
7944                                                     bool ForEpilogue) {
7945   assert(Bypass && "Expected valid bypass basic block.");
7946   ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF;
7947   unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
7948   Value *Count = getTripCount();
7949   // Reuse existing vector loop preheader for TC checks.
7950   // Note that new preheader block is generated for vector loop.
7951   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
7952   IRBuilder<> Builder(TCCheckBlock->getTerminator());
7953 
7954   // Generate code to check if the loop's trip count is less than VF * UF of the
7955   // main vector loop.
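       // For example (illustrative, assuming no scalar epilogue is required): with
       // VF = 4 and UF = 2 this emits roughly
       //   %min.iters.check = icmp ult i64 %count, 8
       // and the bypass is taken when the check is true.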
7956   auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF.isVector()
7957                                                     : VF.isVector())
7958                ? ICmpInst::ICMP_ULE
7959                : ICmpInst::ICMP_ULT;
7960 
7961   Value *CheckMinIters = Builder.CreateICmp(
7962       P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor),
7963       "min.iters.check");
7964 
7965   if (!ForEpilogue)
7966     TCCheckBlock->setName("vector.main.loop.iter.check");
7967 
7968   // Create new preheader for vector loop.
7969   LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
7970                                    DT, LI, nullptr, "vector.ph");
7971 
7972   if (ForEpilogue) {
7973     assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
7974                                  DT->getNode(Bypass)->getIDom()) &&
7975            "TC check is expected to dominate Bypass");
7976 
7977     LoopBypassBlocks.push_back(TCCheckBlock);
7978 
7979     // Save the trip count so we don't have to regenerate it in the
7980     // vec.epilog.iter.check. This is safe to do because the trip count
7981     // generated here dominates the vector epilog iter check.
7982     EPI.TripCount = Count;
7983   }
7984 
7985   BranchInst &BI =
7986       *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
7987   if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
7988     setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
7989   ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
7990 
7991   introduceCheckBlockInVPlan(TCCheckBlock);
7992   return TCCheckBlock;
7993 }
7994 
7995 //===--------------------------------------------------------------------===//
7996 // EpilogueVectorizerEpilogueLoop
7997 //===--------------------------------------------------------------------===//
7998 
7999 /// This function is partially responsible for generating the control flow
8000 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
8001 BasicBlock *
8002 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
8003     const SCEV2ValueTy &ExpandedSCEVs) {
8004   createVectorLoopSkeleton("vec.epilog.");
8005 
8006   // Now, compare the remaining count and, if there aren't enough iterations
8007   // to execute the vectorized epilogue, skip to the scalar part.
8008   LoopVectorPreHeader->setName("vec.epilog.ph");
8009   BasicBlock *VecEpilogueIterationCountCheck =
8010       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->begin(), DT, LI,
8011                  nullptr, "vec.epilog.iter.check", true);
8012   emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader,
8013                                           VecEpilogueIterationCountCheck);
8014   AdditionalBypassBlock = VecEpilogueIterationCountCheck;
8015 
8016   // Adjust the control flow taking the state info from the main loop
8017   // vectorization into account.
8018   assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
8019          "expected this to be saved from the previous pass.");
8020   EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
8021       VecEpilogueIterationCountCheck, LoopVectorPreHeader);
8022 
8023   EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
8024       VecEpilogueIterationCountCheck, LoopScalarPreHeader);
8025 
8026   if (EPI.SCEVSafetyCheck)
8027     EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
8028         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
8029   if (EPI.MemSafetyCheck)
8030     EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
8031         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
8032 
8033   DT->changeImmediateDominator(LoopScalarPreHeader,
8034                                EPI.EpilogueIterationCountCheck);
8035   // Keep track of bypass blocks, as they feed start values to the induction and
8036   // reduction phis in the scalar loop preheader.
8037   if (EPI.SCEVSafetyCheck)
8038     LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
8039   if (EPI.MemSafetyCheck)
8040     LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
8041   LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
8042 
8043   // The vec.epilog.iter.check block may contain Phi nodes from inductions or
8044   // reductions which merge control-flow from the latch block and the middle
8045   // block. Update the incoming values here and move the Phi into the preheader.
8046   SmallVector<PHINode *, 4> PhisInBlock;
8047   for (PHINode &Phi : VecEpilogueIterationCountCheck->phis())
8048     PhisInBlock.push_back(&Phi);
8049 
8050   for (PHINode *Phi : PhisInBlock) {
8051     Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI());
8052     Phi->replaceIncomingBlockWith(
8053         VecEpilogueIterationCountCheck->getSinglePredecessor(),
8054         VecEpilogueIterationCountCheck);
8055 
8056     // If the phi doesn't have an incoming value from the
8057     // EpilogueIterationCountCheck, we are done. Otherwise remove the incoming
8058     // value and also those from other check blocks. This is needed for
8059     // reduction phis only.
8060     if (none_of(Phi->blocks(), [&](BasicBlock *IncB) {
8061           return EPI.EpilogueIterationCountCheck == IncB;
8062         }))
8063       continue;
8064     Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
8065     if (EPI.SCEVSafetyCheck)
8066       Phi->removeIncomingValue(EPI.SCEVSafetyCheck);
8067     if (EPI.MemSafetyCheck)
8068       Phi->removeIncomingValue(EPI.MemSafetyCheck);
8069   }
8070 
8071   // Generate bypass values from the additional bypass block. Note that when the
8072   // vectorized epilogue is skipped due to iteration count check, then the
8073   // resume value for the induction variable comes from the trip count of the
8074   // main vector loop, passed as the second argument.
8075   createInductionAdditionalBypassValues(ExpandedSCEVs, EPI.VectorTripCount);
8076   return LoopVectorPreHeader;
8077 }
8078 
8079 BasicBlock *
8080 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
8081     BasicBlock *Bypass, BasicBlock *Insert) {
8082 
8083   assert(EPI.TripCount &&
8084          "Expected trip count to have been saved in the first pass.");
8085   assert(
8086       (!isa<Instruction>(EPI.TripCount) ||
8087        DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
8088       "saved trip count does not dominate insertion point.");
8089   Value *TC = EPI.TripCount;
8090   IRBuilder<> Builder(Insert->getTerminator());
8091   Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
8092 
8093   // Generate code to check if the loop's trip count is less than VF * UF of the
8094   // vector epilogue loop.
8095   auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector())
8096                ? ICmpInst::ICMP_ULE
8097                : ICmpInst::ICMP_ULT;
8098 
8099   Value *CheckMinIters =
8100       Builder.CreateICmp(P, Count,
8101                          createStepForVF(Builder, Count->getType(),
8102                                          EPI.EpilogueVF, EPI.EpilogueUF),
8103                          "min.epilog.iters.check");
8104 
8105   BranchInst &BI =
8106       *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
8107   if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
8108     unsigned MainLoopStep = UF * VF.getKnownMinValue();
8109     unsigned EpilogueLoopStep =
8110         EPI.EpilogueUF * EPI.EpilogueVF.getKnownMinValue();
8111     // We assume the remaining `Count` is equally distributed in
8112     // [0, MainLoopStep), so the probability for `Count < EpilogueLoopStep`
8113     // should be
8114     //   min(MainLoopStep, EpilogueLoopStep) / MainLoopStep.
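         // As an illustration (example values): with MainLoopStep = 8 and
         // EpilogueLoopStep = 4, EstimatedSkipCount is 4 and the weights below
         // are {4, 4}, i.e. the vectorized epilogue is expected to be skipped
         // about half of the time.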
8115     unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep);
8116     const uint32_t Weights[] = {EstimatedSkipCount,
8117                                 MainLoopStep - EstimatedSkipCount};
8118     setBranchWeights(BI, Weights, /*IsExpected=*/false);
8119   }
8120   ReplaceInstWithInst(Insert->getTerminator(), &BI);
8121   LoopBypassBlocks.push_back(Insert);
8122 
8123   // A new entry block has been created for the epilogue VPlan. Hook it in, as
8124   // otherwise we would try to modify the entry to the main vector loop.
8125   VPIRBasicBlock *NewEntry = Plan.createVPIRBasicBlock(Insert);
8126   VPBasicBlock *OldEntry = Plan.getEntry();
8127   VPBlockUtils::reassociateBlocks(OldEntry, NewEntry);
8128   Plan.setEntry(NewEntry);
8129   // OldEntry is now dead and will be cleaned up when the plan gets destroyed.
8130 
8131   introduceCheckBlockInVPlan(Insert);
8132   return Insert;
8133 }
8134 
8135 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
8136   LLVM_DEBUG({
8137     dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
8138            << "Epilogue Loop VF:" << EPI.EpilogueVF
8139            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
8140   });
8141 }
8142 
8143 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
8144   DEBUG_WITH_TYPE(VerboseDebug, {
8145     dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
8146   });
8147 }
8148 
8149 iterator_range<mapped_iterator<Use *, std::function<VPValue *(Value *)>>>
8150 VPRecipeBuilder::mapToVPValues(User::op_range Operands) {
8151   std::function<VPValue *(Value *)> Fn = [this](Value *Op) {
8152     return getVPValueOrAddLiveIn(Op);
8153   };
8154   return map_range(Operands, Fn);
8155 }
8156 
8157 void VPRecipeBuilder::createSwitchEdgeMasks(SwitchInst *SI) {
8158   BasicBlock *Src = SI->getParent();
8159   assert(!OrigLoop->isLoopExiting(Src) &&
8160          all_of(successors(Src),
8161                 [this](BasicBlock *Succ) {
8162                   return OrigLoop->getHeader() != Succ;
8163                 }) &&
8164          "unsupported switch either exiting loop or continuing to header");
8165   // Create masks where the terminator in Src is a switch. We create masks for
8166   // all edges at the same time. This is more efficient, as we can create and
8167   // collect compares for all cases once.
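       // As an illustration (hypothetical IR), for
       //   switch i32 %c, label %default [
       //     i32 0, label %a
       //     i32 1, label %a
       //     i32 2, label %b ]
       // the mask for edge (Src, %a) is (%c == 0) | (%c == 1) and the mask for
       // (Src, %b) is (%c == 2), each combined with Src's block-in mask if
       // present; the mask for (Src, %default) is the negation of the OR of those
       // masks, again combined with the block-in mask.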
8168   VPValue *Cond = getVPValueOrAddLiveIn(SI->getCondition());
8169   BasicBlock *DefaultDst = SI->getDefaultDest();
8170   MapVector<BasicBlock *, SmallVector<VPValue *>> Dst2Compares;
8171   for (auto &C : SI->cases()) {
8172     BasicBlock *Dst = C.getCaseSuccessor();
8173     assert(!EdgeMaskCache.contains({Src, Dst}) && "Edge masks already created");
8174     // Cases whose destination is the same as default are redundant and can be
8175     // ignored - they will get there anyhow.
8176     if (Dst == DefaultDst)
8177       continue;
8178     auto &Compares = Dst2Compares[Dst];
8179     VPValue *V = getVPValueOrAddLiveIn(C.getCaseValue());
8180     Compares.push_back(Builder.createICmp(CmpInst::ICMP_EQ, Cond, V));
8181   }
8182 
8183   // We need to handle 2 separate cases below for all entries in Dst2Compares,
8184   // which excludes destinations matching the default destination.
8185   VPValue *SrcMask = getBlockInMask(Src);
8186   VPValue *DefaultMask = nullptr;
8187   for (const auto &[Dst, Conds] : Dst2Compares) {
8188     // 1. Dst is not the default destination. Dst is reached if any of the cases
8189     // with destination == Dst are taken. Join the conditions for each case
8190     // whose destination == Dst using an OR.
8191     VPValue *Mask = Conds[0];
8192     for (VPValue *V : ArrayRef<VPValue *>(Conds).drop_front())
8193       Mask = Builder.createOr(Mask, V);
8194     if (SrcMask)
8195       Mask = Builder.createLogicalAnd(SrcMask, Mask);
8196     EdgeMaskCache[{Src, Dst}] = Mask;
8197 
8198     // 2. Create the mask for the default destination, which is reached if none
8199     // of the cases with destination != default destination are taken. Join the
8200     // conditions for each case where the destination is != Dst using an OR and
8201     // negate it.
8202     DefaultMask = DefaultMask ? Builder.createOr(DefaultMask, Mask) : Mask;
8203   }
8204 
8205   if (DefaultMask) {
8206     DefaultMask = Builder.createNot(DefaultMask);
8207     if (SrcMask)
8208       DefaultMask = Builder.createLogicalAnd(SrcMask, DefaultMask);
8209   }
8210   EdgeMaskCache[{Src, DefaultDst}] = DefaultMask;
8211 }
8212 
8213 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
8214   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
8215 
8216   // Look for cached value.
8217   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
8218   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
8219   if (ECEntryIt != EdgeMaskCache.end())
8220     return ECEntryIt->second;
8221 
8222   if (auto *SI = dyn_cast<SwitchInst>(Src->getTerminator())) {
8223     createSwitchEdgeMasks(SI);
8224     assert(EdgeMaskCache.contains(Edge) && "Mask for Edge not created?");
8225     return EdgeMaskCache[Edge];
8226   }
8227 
8228   VPValue *SrcMask = getBlockInMask(Src);
8229 
8230   // The terminator has to be a branch inst!
8231   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
8232   assert(BI && "Unexpected terminator found");
8233   if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
8234     return EdgeMaskCache[Edge] = SrcMask;
8235 
8236   // If source is an exiting block, we know the exit edge is dynamically dead
8237   // in the vector loop, and thus we don't need to restrict the mask.  Avoid
8238   // adding uses of an otherwise potentially dead instruction unless we are
8239   // vectorizing a loop with uncountable exits. In that case, we always
8240   // materialize the mask.
8241   if (OrigLoop->isLoopExiting(Src) &&
8242       Src != Legal->getUncountableEarlyExitingBlock())
8243     return EdgeMaskCache[Edge] = SrcMask;
8244 
8245   VPValue *EdgeMask = getVPValueOrAddLiveIn(BI->getCondition());
8246   assert(EdgeMask && "No Edge Mask found for condition");
8247 
8248   if (BI->getSuccessor(0) != Dst)
8249     EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc());
8250 
8251   if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
8252     // The bitwise 'And' of SrcMask and EdgeMask introduces new UB if SrcMask
8253     // is false and EdgeMask is poison. Avoid that by using 'LogicalAnd'
8254     // instead which generates 'select i1 SrcMask, i1 EdgeMask, i1 false'.
8255     EdgeMask = Builder.createLogicalAnd(SrcMask, EdgeMask, BI->getDebugLoc());
8256   }
8257 
8258   return EdgeMaskCache[Edge] = EdgeMask;
8259 }
8260 
8261 VPValue *VPRecipeBuilder::getEdgeMask(BasicBlock *Src, BasicBlock *Dst) const {
8262   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
8263 
8264   // Look for cached value.
8265   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
8266   EdgeMaskCacheTy::const_iterator ECEntryIt = EdgeMaskCache.find(Edge);
8267   assert(ECEntryIt != EdgeMaskCache.end() &&
8268          "looking up mask for edge which has not been created");
8269   return ECEntryIt->second;
8270 }
8271 
8272 void VPRecipeBuilder::createHeaderMask() {
8273   BasicBlock *Header = OrigLoop->getHeader();
8274 
8275   // When not folding the tail, use nullptr to model all-true mask.
8276   if (!CM.foldTailByMasking()) {
8277     BlockMaskCache[Header] = nullptr;
8278     return;
8279   }
8280 
8281   // Introduce the early-exit compare IV <= BTC to form header block mask.
8282   // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
8283   // constructing the desired canonical IV in the header block as its first
8284   // non-phi instructions.
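       // For example (illustrative): with BTC = 9 and VF = 4, the widened IV for
       // the third vector iteration holds <8, 9, 10, 11>, so the header mask is
       // <true, true, false, false>, disabling the two extra lanes of the folded
       // tail.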
8285 
8286   VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
8287   auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
8288   auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV());
8289   HeaderVPBB->insert(IV, NewInsertionPoint);
8290 
8291   VPBuilder::InsertPointGuard Guard(Builder);
8292   Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
8293   VPValue *BlockMask = nullptr;
8294   VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
8295   BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC);
8296   BlockMaskCache[Header] = BlockMask;
8297 }
8298 
8299 VPValue *VPRecipeBuilder::getBlockInMask(BasicBlock *BB) const {
8300   // Return the cached value.
8301   BlockMaskCacheTy::const_iterator BCEntryIt = BlockMaskCache.find(BB);
8302   assert(BCEntryIt != BlockMaskCache.end() &&
8303          "Trying to access mask for block without one.");
8304   return BCEntryIt->second;
8305 }
8306 
8307 void VPRecipeBuilder::createBlockInMask(BasicBlock *BB) {
8308   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
8309   assert(BlockMaskCache.count(BB) == 0 && "Mask for block already computed");
8310   assert(OrigLoop->getHeader() != BB &&
8311          "Loop header must have cached block mask");
8312 
8313   // All-one mask is modelled as no-mask following the convention for masked
8314   // load/store/gather/scatter. Initialize BlockMask to no-mask.
8315   VPValue *BlockMask = nullptr;
8316   // This is the block mask. We OR all unique incoming edges.
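       // For example (illustrative): a block with two predecessors P0 and P1 gets
       // mask(BB) = edge-mask(P0, BB) | edge-mask(P1, BB), unless one of the edge
       // masks is all-one (nullptr), in which case the block mask is all-one too.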
8317   for (auto *Predecessor :
8318        SetVector<BasicBlock *>(pred_begin(BB), pred_end(BB))) {
8319     VPValue *EdgeMask = createEdgeMask(Predecessor, BB);
8320     if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is too.
8321       BlockMaskCache[BB] = EdgeMask;
8322       return;
8323     }
8324 
8325     if (!BlockMask) { // BlockMask has its initialized nullptr value.
8326       BlockMask = EdgeMask;
8327       continue;
8328     }
8329 
8330     BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
8331   }
8332 
8333   BlockMaskCache[BB] = BlockMask;
8334 }
8335 
8336 VPWidenMemoryRecipe *
8337 VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
8338                                   VFRange &Range) {
8339   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
8340          "Must be called with either a load or store");
8341 
8342   auto WillWiden = [&](ElementCount VF) -> bool {
8343     LoopVectorizationCostModel::InstWidening Decision =
8344         CM.getWideningDecision(I, VF);
8345     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
8346            "CM decision should be taken at this point.");
8347     if (Decision == LoopVectorizationCostModel::CM_Interleave)
8348       return true;
8349     if (CM.isScalarAfterVectorization(I, VF) ||
8350         CM.isProfitableToScalarize(I, VF))
8351       return false;
8352     return Decision != LoopVectorizationCostModel::CM_Scalarize;
8353   };
8354 
8355   if (!LoopVectorizationPlanner::getDecisionAndClampRange(WillWiden, Range))
8356     return nullptr;
8357 
8358   VPValue *Mask = nullptr;
8359   if (Legal->isMaskRequired(I))
8360     Mask = getBlockInMask(I->getParent());
8361 
8362   // Determine if the pointer operand of the access is either consecutive or
8363   // reverse consecutive.
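       // For example (illustrative): a load of A[i] in a loop with unit step is
       // typically CM_Widen (consecutive), while a load of A[N - i] is typically
       // CM_Widen_Reverse (reverse consecutive).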
8364   LoopVectorizationCostModel::InstWidening Decision =
8365       CM.getWideningDecision(I, Range.Start);
8366   bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
8367   bool Consecutive =
8368       Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
8369 
8370   VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
8371   if (Consecutive) {
8372     auto *GEP = dyn_cast<GetElementPtrInst>(
8373         Ptr->getUnderlyingValue()->stripPointerCasts());
8374     VPSingleDefRecipe *VectorPtr;
8375     if (Reverse) {
8376       // When folding the tail, we may compute an address that we don't in the
8377       // original scalar loop and it may not be inbounds. Drop Inbounds in that
8378       // case.
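           // For example (illustrative): with tail folding, VF = 4 and a trip
           // count of 10, the last vector iteration computes addresses for lanes
           // 10 and 11 that the scalar loop never accesses, so inbounds cannot be
           // preserved.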
8379       GEPNoWrapFlags Flags =
8380           (CM.foldTailByMasking() || !GEP || !GEP->isInBounds())
8381               ? GEPNoWrapFlags::none()
8382               : GEPNoWrapFlags::inBounds();
8383       VectorPtr = new VPReverseVectorPointerRecipe(
8384           Ptr, &Plan.getVF(), getLoadStoreType(I), Flags, I->getDebugLoc());
8385     } else {
8386       VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I),
8387                                             GEP ? GEP->getNoWrapFlags()
8388                                                 : GEPNoWrapFlags::none(),
8389                                             I->getDebugLoc());
8390     }
8391     Builder.getInsertBlock()->appendRecipe(VectorPtr);
8392     Ptr = VectorPtr;
8393   }
8394   if (LoadInst *Load = dyn_cast<LoadInst>(I))
8395     return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
8396                                  I->getDebugLoc());
8397 
8398   StoreInst *Store = cast<StoreInst>(I);
8399   return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive,
8400                                 Reverse, I->getDebugLoc());
8401 }
8402 
8403 /// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will also
8404 /// insert a recipe to expand the step for the induction recipe.
8405 static VPWidenIntOrFpInductionRecipe *
8406 createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc,
8407                             VPValue *Start, const InductionDescriptor &IndDesc,
8408                             VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop) {
8409   assert(IndDesc.getStartValue() ==
8410          Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
8411   assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) &&
8412          "step must be loop invariant");
8413 
8414   VPValue *Step =
8415       vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE);
8416   if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) {
8417     return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(),
8418                                              IndDesc, TruncI,
8419                                              TruncI->getDebugLoc());
8420   }
8421   assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
8422   return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(),
8423                                            IndDesc, Phi->getDebugLoc());
8424 }
8425 
8426 VPHeaderPHIRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI(
8427     PHINode *Phi, ArrayRef<VPValue *> Operands, VFRange &Range) {
8428 
8429   // Check if this is an integer or fp induction. If so, build the recipe that
8430   // produces its scalar and vector values.
8431   if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
8432     return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, Plan,
8433                                        *PSE.getSE(), *OrigLoop);
8434 
8435   // Check if this is pointer induction. If so, build the recipe for it.
8436   if (auto *II = Legal->getPointerInductionDescriptor(Phi)) {
8437     VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep(),
8438                                                            *PSE.getSE());
8439     return new VPWidenPointerInductionRecipe(
8440         Phi, Operands[0], Step, *II,
8441         LoopVectorizationPlanner::getDecisionAndClampRange(
8442             [&](ElementCount VF) {
8443               return CM.isScalarAfterVectorization(Phi, VF);
8444             },
8445             Range),
8446         Phi->getDebugLoc());
8447   }
8448   return nullptr;
8449 }
8450 
8451 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
8452     TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range) {
8453   // Optimize the special case where the source is a constant integer
8454   // induction variable. Notice that we can only optimize the 'trunc' case
8455   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
8456   // (c) other casts depend on pointer size.
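  // For example (illustrative only): for an i64 induction %i, a use
  //   %t = trunc i64 %i to i32
  // can be widened directly as a 32-bit induction <i, i+1, ...> instead of
  // widening %i to an i64 vector and truncating every element.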
8457 
8458   // Determine whether \p K is a truncation based on an induction variable that
8459   // can be optimized.
8460   auto IsOptimizableIVTruncate =
8461       [&](Instruction *K) -> std::function<bool(ElementCount)> {
8462     return [=](ElementCount VF) -> bool {
8463       return CM.isOptimizableIVTruncate(K, VF);
8464     };
8465   };
8466 
8467   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8468           IsOptimizableIVTruncate(I), Range)) {
8469 
8470     auto *Phi = cast<PHINode>(I->getOperand(0));
8471     const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi);
8472     VPValue *Start = Plan.getOrAddLiveIn(II.getStartValue());
8473     return createWidenInductionRecipes(Phi, I, Start, II, Plan, *PSE.getSE(),
8474                                        *OrigLoop);
8475   }
8476   return nullptr;
8477 }
8478 
8479 VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi,
8480                                            ArrayRef<VPValue *> Operands) {
8481   unsigned NumIncoming = Phi->getNumIncomingValues();
8482 
8483   // We know that all PHIs in non-header blocks are converted into selects, so
8484   // we don't have to worry about the insertion order and we can just use the
8485   // builder. At this point we generate the predication tree. There may be
8486   // duplications since this is a simple recursive scan, but future
8487   // optimizations will clean it up.
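  // For example (illustrative only): for
  //   x = phi [a, pred.a], [b, pred.b]
  // the blend recipe records the pairs (a, mask.a) and (b, mask.b) and is
  // later lowered to a select chain such as x = select(mask.b, b, a).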
8488   SmallVector<VPValue *, 2> OperandsWithMask;
8489 
8490   for (unsigned In = 0; In < NumIncoming; In++) {
8491     OperandsWithMask.push_back(Operands[In]);
8492     VPValue *EdgeMask =
8493         getEdgeMask(Phi->getIncomingBlock(In), Phi->getParent());
8494     if (!EdgeMask) {
8495       assert(In == 0 && "Both null and non-null edge masks found");
8496       assert(all_equal(Operands) &&
8497              "Distinct incoming values with one having a full mask");
8498       break;
8499     }
8500     OperandsWithMask.push_back(EdgeMask);
8501   }
8502   return new VPBlendRecipe(Phi, OperandsWithMask);
8503 }
8504 
8505 VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
8506                                                    ArrayRef<VPValue *> Operands,
8507                                                    VFRange &Range) {
8508   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8509       [this, CI](ElementCount VF) {
8510         return CM.isScalarWithPredication(CI, VF);
8511       },
8512       Range);
8513 
8514   if (IsPredicated)
8515     return nullptr;
8516 
8517   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8518   if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8519              ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8520              ID == Intrinsic::pseudoprobe ||
8521              ID == Intrinsic::experimental_noalias_scope_decl))
8522     return nullptr;
8523 
8524   SmallVector<VPValue *, 4> Ops(Operands.take_front(CI->arg_size()));
8525 
8526   // Is it beneficial to perform an intrinsic call compared to a library call?
8527   bool ShouldUseVectorIntrinsic =
8528       ID && LoopVectorizationPlanner::getDecisionAndClampRange(
8529                 [&](ElementCount VF) -> bool {
8530                   return CM.getCallWideningDecision(CI, VF).Kind ==
8531                          LoopVectorizationCostModel::CM_IntrinsicCall;
8532                 },
8533                 Range);
8534   if (ShouldUseVectorIntrinsic)
8535     return new VPWidenIntrinsicRecipe(*CI, ID, Ops, CI->getType(),
8536                                       CI->getDebugLoc());
8537 
8538   Function *Variant = nullptr;
8539   std::optional<unsigned> MaskPos;
8540   // Is it better to call a vectorized version of the function than to
8541   // scalarize the call?
8542   auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange(
8543       [&](ElementCount VF) -> bool {
8544         // The following case may be scalarized depending on the VF.
8545         // The flag shows whether we can use a usual call for the vectorized
8546         // version of the instruction.
8547 
8548         // If we've found a variant at a previous VF, then stop looking. A
8549         // vectorized variant of a function expects input in a certain shape
8550         // -- basically the number of input registers, the number of lanes
8551         // per register, and whether there's a mask required.
8552         // We store a pointer to the variant in the VPWidenCallRecipe, so
8553         // once we have an appropriate variant it's only valid for that VF.
8554         // This will force a different vplan to be generated for each VF that
8555         // finds a valid variant.
8556         if (Variant)
8557           return false;
8558         LoopVectorizationCostModel::CallWideningDecision Decision =
8559             CM.getCallWideningDecision(CI, VF);
8560         if (Decision.Kind == LoopVectorizationCostModel::CM_VectorCall) {
8561           Variant = Decision.Variant;
8562           MaskPos = Decision.MaskPos;
8563           return true;
8564         }
8565 
8566         return false;
8567       },
8568       Range);
8569   if (ShouldUseVectorCall) {
8570     if (MaskPos.has_value()) {
8571       // We have 2 cases that would require a mask:
8572       //   1) The block needs to be predicated, either due to a conditional
8573       //      in the scalar loop or use of an active lane mask with
8574       //      tail-folding, and we use the appropriate mask for the block.
8575       //   2) No mask is required for the block, but the only available
8576       //      vector variant at this VF requires a mask, so we synthesize an
8577       //      all-true mask.
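      // For example (illustrative only, with a made-up variant name): if the
      // only vector variant found for foo(x) at this VF requires a mask, the
      // widened call becomes something like
      //   call <4 x float> @foo_masked(<4 x float> %x, <4 x i1> %m)
      // where %m is either the block mask or an all-true constant.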
8578       VPValue *Mask = nullptr;
8579       if (Legal->isMaskRequired(CI))
8580         Mask = getBlockInMask(CI->getParent());
8581       else
8582         Mask = Plan.getOrAddLiveIn(
8583             ConstantInt::getTrue(IntegerType::getInt1Ty(CI->getContext())));
8584 
8585       Ops.insert(Ops.begin() + *MaskPos, Mask);
8586     }
8587 
8588     Ops.push_back(Operands.back());
8589     return new VPWidenCallRecipe(CI, Variant, Ops, CI->getDebugLoc());
8590   }
8591 
8592   return nullptr;
8593 }
8594 
8595 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
8596   assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8597          !isa<StoreInst>(I) && "Instruction should have been handled earlier");
8598   // Instruction should be widened, unless it is scalar after vectorization,
8599   // scalarization is profitable, or it is predicated.
8600   auto WillScalarize = [this, I](ElementCount VF) -> bool {
8601     return CM.isScalarAfterVectorization(I, VF) ||
8602            CM.isProfitableToScalarize(I, VF) ||
8603            CM.isScalarWithPredication(I, VF);
8604   };
8605   return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
8606                                                              Range);
8607 }
8608 
8609 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
8610                                            ArrayRef<VPValue *> Operands,
8611                                            VPBasicBlock *VPBB) {
8612   switch (I->getOpcode()) {
8613   default:
8614     return nullptr;
8615   case Instruction::SDiv:
8616   case Instruction::UDiv:
8617   case Instruction::SRem:
8618   case Instruction::URem: {
8619     // If not provably safe, use a select to form a safe divisor before widening the
8620     // div/rem operation itself.  Otherwise fall through to general handling below.
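    // For example (illustrative only): a predicated 'udiv %x, %y' is widened
    // roughly as
    //   %safe.y = select <N x i1> %mask, <N x iM> %y, <N x iM> splat(1)
    //   %res    = udiv <N x iM> %x, %safe.y
    // so masked-off lanes never divide by a potentially zero divisor.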
8621     if (CM.isPredicatedInst(I)) {
8622       SmallVector<VPValue *> Ops(Operands);
8623       VPValue *Mask = getBlockInMask(I->getParent());
8624       VPValue *One =
8625           Plan.getOrAddLiveIn(ConstantInt::get(I->getType(), 1u, false));
8626       auto *SafeRHS = Builder.createSelect(Mask, Ops[1], One, I->getDebugLoc());
8627       Ops[1] = SafeRHS;
8628       return new VPWidenRecipe(*I, make_range(Ops.begin(), Ops.end()));
8629     }
8630     [[fallthrough]];
8631   }
8632   case Instruction::Add:
8633   case Instruction::And:
8634   case Instruction::AShr:
8635   case Instruction::FAdd:
8636   case Instruction::FCmp:
8637   case Instruction::FDiv:
8638   case Instruction::FMul:
8639   case Instruction::FNeg:
8640   case Instruction::FRem:
8641   case Instruction::FSub:
8642   case Instruction::ICmp:
8643   case Instruction::LShr:
8644   case Instruction::Mul:
8645   case Instruction::Or:
8646   case Instruction::Select:
8647   case Instruction::Shl:
8648   case Instruction::Sub:
8649   case Instruction::Xor:
8650   case Instruction::Freeze:
8651     SmallVector<VPValue *> NewOps(Operands);
8652     if (Instruction::isBinaryOp(I->getOpcode())) {
8653       // The legacy cost model uses SCEV to check if some of the operands are
8654       // constants. To match the legacy cost model's behavior, use SCEV to try
8655       // to replace operands with constants.
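      // For example (illustrative only): for 'mul %x, %n' where SCEV proves
      // that %n is the constant 8 on this path, the operand is replaced by a
      // live-in constant 8, so the recipe is costed as a multiply by a
      // constant, matching the legacy model.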
8656       ScalarEvolution &SE = *PSE.getSE();
8657       auto GetConstantViaSCEV = [this, &SE](VPValue *Op) {
8658         Value *V = Op->getUnderlyingValue();
8659         if (isa<Constant>(V) || !SE.isSCEVable(V->getType()))
8660           return Op;
8661         auto *C = dyn_cast<SCEVConstant>(SE.getSCEV(V));
8662         if (!C)
8663           return Op;
8664         return Plan.getOrAddLiveIn(C->getValue());
8665       };
8666       // For Mul, the legacy cost model checks both operands.
8667       if (I->getOpcode() == Instruction::Mul)
8668         NewOps[0] = GetConstantViaSCEV(NewOps[0]);
8669       // For other binops, the legacy cost model only checks the second operand.
8670       NewOps[1] = GetConstantViaSCEV(NewOps[1]);
8671     }
8672     return new VPWidenRecipe(*I, make_range(NewOps.begin(), NewOps.end()));
8673   };
8674 }
8675 
8676 VPHistogramRecipe *
8677 VPRecipeBuilder::tryToWidenHistogram(const HistogramInfo *HI,
8678                                      ArrayRef<VPValue *> Operands) {
8679   // FIXME: Support other operations.
8680   unsigned Opcode = HI->Update->getOpcode();
8681   assert((Opcode == Instruction::Add || Opcode == Instruction::Sub) &&
8682          "Histogram update operation must be an Add or Sub");
8683 
8684   SmallVector<VPValue *, 3> HGramOps;
8685   // Bucket address.
8686   HGramOps.push_back(Operands[1]);
8687   // Increment value.
8688   HGramOps.push_back(getVPValueOrAddLiveIn(HI->Update->getOperand(1)));
8689 
8690   // In case of predicated execution (due to tail-folding, or conditional
8691   // execution, or both), pass the relevant mask.
8692   if (Legal->isMaskRequired(HI->Store))
8693     HGramOps.push_back(getBlockInMask(HI->Store->getParent()));
8694 
8695   return new VPHistogramRecipe(Opcode,
8696                                make_range(HGramOps.begin(), HGramOps.end()),
8697                                HI->Store->getDebugLoc());
8698 }
8699 
8700 void VPRecipeBuilder::fixHeaderPhis() {
8701   BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
8702   for (VPHeaderPHIRecipe *R : PhisToFix) {
8703     auto *PN = cast<PHINode>(R->getUnderlyingValue());
8704     VPRecipeBase *IncR =
8705         getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
8706     R->addOperand(IncR->getVPSingleValue());
8707   }
8708 }
8709 
8710 VPReplicateRecipe *VPRecipeBuilder::handleReplication(Instruction *I,
8711                                                       VFRange &Range) {
8712   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
8713       [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8714       Range);
8715 
8716   bool IsPredicated = CM.isPredicatedInst(I);
8717 
8718   // Even if the instruction is not marked as uniform, there are certain
8719   // intrinsic calls that can be effectively treated as such, so we check for
8720   // them here. Conservatively, we only do this for scalable vectors, since
8721   // for fixed-width VFs we can always fall back on full scalarization.
8722   if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
8723     switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
8724     case Intrinsic::assume:
8725     case Intrinsic::lifetime_start:
8726     case Intrinsic::lifetime_end:
8727       // For scalable vectors if one of the operands is variant then we still
8728       // want to mark as uniform, which will generate one instruction for just
8729       // the first lane of the vector. We can't scalarize the call in the same
8730       // way as for fixed-width vectors because we don't know how many lanes
8731       // there are.
8732       //
8733       // The reasons for doing it this way for scalable vectors are:
8734       //   1. For the assume intrinsic generating the instruction for the first
8735       //      lane is still better than not generating any at all. For
8736       //      example, the input may be a splat across all lanes.
8737       //   2. For the lifetime start/end intrinsics the pointer operand only
8738       //      does anything useful when the input comes from a stack object,
8739       //      which suggests it should always be uniform. For non-stack objects
8740       //      the effect is to poison the object, which still allows us to
8741       //      remove the call.
8742       IsUniform = true;
8743       break;
8744     default:
8745       break;
8746     }
8747   }
8748   VPValue *BlockInMask = nullptr;
8749   if (!IsPredicated) {
8750     // Finalize the recipe for Instr, first if it is not predicated.
8751     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8752   } else {
8753     LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8754     // Instructions marked for predication are replicated and a mask operand is
8755     // added initially. Masked replicate recipes will later be placed under an
8756     // if-then construct to prevent side-effects. Generate recipes to compute
8757     // the block mask for this region.
8758     BlockInMask = getBlockInMask(I->getParent());
8759   }
8760 
8761   // Note that there is some custom logic to mark some intrinsics as uniform
8762   // manually above for scalable vectors, which this assert needs to account for
8763   // as well.
8764   assert((Range.Start.isScalar() || !IsUniform || !IsPredicated ||
8765           (Range.Start.isScalable() && isa<IntrinsicInst>(I))) &&
8766          "Should not predicate a uniform recipe");
8767   auto *Recipe = new VPReplicateRecipe(I, mapToVPValues(I->operands()),
8768                                        IsUniform, BlockInMask);
8769   return Recipe;
8770 }
8771 
8772 /// Find all possible partial reductions in the loop and track all of those that
8773 /// are valid so recipes can be formed later.
8774 void VPRecipeBuilder::collectScaledReductions(VFRange &Range) {
8775   // Find all possible partial reductions.
8776   SmallVector<std::pair<PartialReductionChain, unsigned>, 1>
8777       PartialReductionChains;
8778   for (const auto &[Phi, RdxDesc] : Legal->getReductionVars())
8779     if (std::optional<std::pair<PartialReductionChain, unsigned>> Pair =
8780             getScaledReduction(Phi, RdxDesc, Range))
8781       PartialReductionChains.push_back(*Pair);
8782 
8783   // A partial reduction is invalid if any of its extends are used by
8784   // something that isn't another partial reduction. This is because the
8785   // extends are intended to be lowered along with the reduction itself.
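  // For example (illustrative only): in 'sum += sext(a[i]) * sext(b[i])' both
  // extends feed only the multiply of the chain, so it stays valid; if
  // 'sext(a[i])' were also stored to memory, the extend could no longer be
  // folded into the partial reduction and the chain is dropped.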
8786 
8787   // Build up a set of partial reduction bin ops for efficient use checking.
8788   SmallSet<User *, 4> PartialReductionBinOps;
8789   for (const auto &[PartialRdx, _] : PartialReductionChains)
8790     PartialReductionBinOps.insert(PartialRdx.BinOp);
8791 
8792   auto ExtendIsOnlyUsedByPartialReductions =
8793       [&PartialReductionBinOps](Instruction *Extend) {
8794         return all_of(Extend->users(), [&](const User *U) {
8795           return PartialReductionBinOps.contains(U);
8796         });
8797       };
8798 
8799   // Check if each use of a chain's two extends is a partial reduction
8800   // and only add those that don't have non-partial reduction users.
8801   for (auto Pair : PartialReductionChains) {
8802     PartialReductionChain Chain = Pair.first;
8803     if (ExtendIsOnlyUsedByPartialReductions(Chain.ExtendA) &&
8804         ExtendIsOnlyUsedByPartialReductions(Chain.ExtendB))
8805       ScaledReductionExitInstrs.insert(std::make_pair(Chain.Reduction, Pair));
8806   }
8807 }
8808 
8809 std::optional<std::pair<PartialReductionChain, unsigned>>
8810 VPRecipeBuilder::getScaledReduction(PHINode *PHI,
8811                                     const RecurrenceDescriptor &Rdx,
8812                                     VFRange &Range) {
8813   // TODO: Allow scaling reductions when predicating. The select at
8814   // the end of the loop chooses between the phi value and most recent
8815   // reduction result, both of which have different VFs to the active lane
8816   // mask when scaling.
8817   if (CM.blockNeedsPredicationForAnyReason(Rdx.getLoopExitInstr()->getParent()))
8818     return std::nullopt;
8819 
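  // The chain matched below is (illustrative only) a dot-product style
  // reduction such as
  //   %ext.a = sext i8 %a to i32
  //   %ext.b = sext i8 %b to i32
  //   %mul   = mul i32 %ext.a, %ext.b
  //   %rdx   = add i32 %phi, %mul
  // where the i8 -> i32 widening gives a scale factor of 32 / 8 = 4.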
8820   auto *Update = dyn_cast<BinaryOperator>(Rdx.getLoopExitInstr());
8821   if (!Update)
8822     return std::nullopt;
8823 
8824   Value *Op = Update->getOperand(0);
8825   Value *PhiOp = Update->getOperand(1);
8826   if (Op == PHI) {
8827     Op = Update->getOperand(1);
8828     PhiOp = Update->getOperand(0);
8829   }
8830   if (PhiOp != PHI)
8831     return std::nullopt;
8832 
8833   auto *BinOp = dyn_cast<BinaryOperator>(Op);
8834   if (!BinOp || !BinOp->hasOneUse())
8835     return std::nullopt;
8836 
8837   using namespace llvm::PatternMatch;
8838   Value *A, *B;
8839   if (!match(BinOp->getOperand(0), m_ZExtOrSExt(m_Value(A))) ||
8840       !match(BinOp->getOperand(1), m_ZExtOrSExt(m_Value(B))))
8841     return std::nullopt;
8842 
8843   Instruction *ExtA = cast<Instruction>(BinOp->getOperand(0));
8844   Instruction *ExtB = cast<Instruction>(BinOp->getOperand(1));
8845 
8846   TTI::PartialReductionExtendKind OpAExtend =
8847       TargetTransformInfo::getPartialReductionExtendKind(ExtA);
8848   TTI::PartialReductionExtendKind OpBExtend =
8849       TargetTransformInfo::getPartialReductionExtendKind(ExtB);
8850 
8851   PartialReductionChain Chain(Rdx.getLoopExitInstr(), ExtA, ExtB, BinOp);
8852 
8853   unsigned TargetScaleFactor =
8854       PHI->getType()->getPrimitiveSizeInBits().getKnownScalarFactor(
8855           A->getType()->getPrimitiveSizeInBits());
8856 
8857   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8858           [&](ElementCount VF) {
8859             InstructionCost Cost = TTI->getPartialReductionCost(
8860                 Update->getOpcode(), A->getType(), B->getType(), PHI->getType(),
8861                 VF, OpAExtend, OpBExtend,
8862                 std::make_optional(BinOp->getOpcode()));
8863             return Cost.isValid();
8864           },
8865           Range))
8866     return std::make_pair(Chain, TargetScaleFactor);
8867 
8868   return std::nullopt;
8869 }
8870 
8871 VPRecipeBase *
8872 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8873                                         ArrayRef<VPValue *> Operands,
8874                                         VFRange &Range, VPBasicBlock *VPBB) {
8875   // First, check for specific widening recipes that deal with inductions, Phi
8876   // nodes, calls and memory operations.
8877   VPRecipeBase *Recipe;
8878   if (auto *Phi = dyn_cast<PHINode>(Instr)) {
8879     if (Phi->getParent() != OrigLoop->getHeader())
8880       return tryToBlend(Phi, Operands);
8881 
8882     if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range)))
8883       return Recipe;
8884 
8885     VPHeaderPHIRecipe *PhiRecipe = nullptr;
8886     assert((Legal->isReductionVariable(Phi) ||
8887             Legal->isFixedOrderRecurrence(Phi)) &&
8888            "can only widen reductions and fixed-order recurrences here");
8889     VPValue *StartV = Operands[0];
8890     if (Legal->isReductionVariable(Phi)) {
8891       const RecurrenceDescriptor &RdxDesc =
8892           Legal->getReductionVars().find(Phi)->second;
8893       assert(RdxDesc.getRecurrenceStartValue() ==
8894              Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8895 
8896       // If the PHI is used by a partial reduction, set the scale factor.
8897       std::optional<std::pair<PartialReductionChain, unsigned>> Pair =
8898           getScaledReductionForInstr(RdxDesc.getLoopExitInstr());
8899       unsigned ScaleFactor = Pair ? Pair->second : 1;
8900       PhiRecipe = new VPReductionPHIRecipe(
8901           Phi, RdxDesc, *StartV, CM.isInLoopReduction(Phi),
8902           CM.useOrderedReductions(RdxDesc), ScaleFactor);
8903     } else {
8904       // TODO: Currently fixed-order recurrences are modeled as chains of
8905       // first-order recurrences. If there are no users of the intermediate
8906       // recurrences in the chain, the fixed order recurrence should be modeled
8907       // directly, enabling more efficient codegen.
8908       PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
8909     }
8910 
8911     PhisToFix.push_back(PhiRecipe);
8912     return PhiRecipe;
8913   }
8914 
8915   if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate(
8916                                     cast<TruncInst>(Instr), Operands, Range)))
8917     return Recipe;
8918 
8919   // All widen recipes below deal only with VF > 1.
8920   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8921           [&](ElementCount VF) { return VF.isScalar(); }, Range))
8922     return nullptr;
8923 
8924   if (auto *CI = dyn_cast<CallInst>(Instr))
8925     return tryToWidenCall(CI, Operands, Range);
8926 
8927   if (StoreInst *SI = dyn_cast<StoreInst>(Instr))
8928     if (auto HistInfo = Legal->getHistogramInfo(SI))
8929       return tryToWidenHistogram(*HistInfo, Operands);
8930 
8931   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8932     return tryToWidenMemory(Instr, Operands, Range);
8933 
8934   if (getScaledReductionForInstr(Instr))
8935     return tryToCreatePartialReduction(Instr, Operands);
8936 
8937   if (!shouldWiden(Instr, Range))
8938     return nullptr;
8939 
8940   if (auto *GEP = dyn_cast<GetElementPtrInst>(Instr))
8941     return new VPWidenGEPRecipe(GEP,
8942                                 make_range(Operands.begin(), Operands.end()));
8943 
8944   if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8945     return new VPWidenSelectRecipe(
8946         *SI, make_range(Operands.begin(), Operands.end()));
8947   }
8948 
8949   if (auto *CI = dyn_cast<CastInst>(Instr)) {
8950     return new VPWidenCastRecipe(CI->getOpcode(), Operands[0], CI->getType(),
8951                                  *CI);
8952   }
8953 
8954   return tryToWiden(Instr, Operands, VPBB);
8955 }
8956 
8957 VPRecipeBase *
8958 VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
8959                                              ArrayRef<VPValue *> Operands) {
8960   assert(Operands.size() == 2 &&
8961          "Unexpected number of operands for partial reduction");
8962 
8963   VPValue *BinOp = Operands[0];
8964   VPValue *Phi = Operands[1];
8965   if (isa<VPReductionPHIRecipe>(BinOp->getDefiningRecipe()))
8966     std::swap(BinOp, Phi);
8967 
8968   return new VPPartialReductionRecipe(Reduction->getOpcode(), BinOp, Phi,
8969                                       Reduction);
8970 }
8971 
8972 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8973                                                         ElementCount MaxVF) {
8974   assert(OrigLoop->isInnermost() && "Inner loop expected.");
8975 
8976   auto MaxVFTimes2 = MaxVF * 2;
8977   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
8978     VFRange SubRange = {VF, MaxVFTimes2};
8979     if (auto Plan = tryToBuildVPlanWithVPRecipes(SubRange)) {
8980       // Now optimize the initial VPlan.
8981       if (!Plan->hasVF(ElementCount::getFixed(1)))
8982         VPlanTransforms::truncateToMinimalBitwidths(*Plan,
8983                                                     CM.getMinimalBitwidths());
8984       VPlanTransforms::optimize(*Plan);
8985       // TODO: try to put it close to addActiveLaneMask().
8986       // Discard the plan if it is not EVL-compatible
8987       if (CM.foldTailWithEVL() && !VPlanTransforms::tryAddExplicitVectorLength(
8988                                       *Plan, CM.getMaxSafeElements()))
8989         break;
8990       assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
8991       VPlans.push_back(std::move(Plan));
8992     }
8993     VF = SubRange.End;
8994   }
8995 }
8996 
8997 // Add the necessary canonical IV and branch recipes required to control the
8998 // loop.
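// The emitted recipes correspond to the following shorthand (illustrative
// only):
//   vector.body:
//     %index = canonical-iv-phi [ 0, vector.ph ], [ %index.next, latch ]
//     ...
//     %index.next = add (nuw) %index, VF * UF
//     branch-on-count %index.next, %vector.trip.count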
8999 static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
9000                                   DebugLoc DL) {
9001   Value *StartIdx = ConstantInt::get(IdxTy, 0);
9002   auto *StartV = Plan.getOrAddLiveIn(StartIdx);
9003 
9004   // Add a VPCanonicalIVPHIRecipe starting at 0 to the header.
9005   auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
9006   VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
9007   VPBasicBlock *Header = TopRegion->getEntryBasicBlock();
9008   Header->insert(CanonicalIVPHI, Header->begin());
9009 
9010   VPBuilder Builder(TopRegion->getExitingBasicBlock());
9011   // Add a VPInstruction to increment the scalar canonical IV by VF * UF.
9012   auto *CanonicalIVIncrement = Builder.createOverflowingOp(
9013       Instruction::Add, {CanonicalIVPHI, &Plan.getVFxUF()}, {HasNUW, false}, DL,
9014       "index.next");
9015   CanonicalIVPHI->addOperand(CanonicalIVIncrement);
9016 
9017   // Add the BranchOnCount VPInstruction to the latch.
9018   Builder.createNaryOp(VPInstruction::BranchOnCount,
9019                        {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
9020 }
9021 
9022 /// Create and return a ResumePhi for \p WideIV, unless it is truncated. If the
9023 /// induction recipe is not canonical, creates a VPDerivedIVRecipe to compute
9024 /// the end value of the induction.
9025 static VPValue *addResumePhiRecipeForInduction(VPWidenInductionRecipe *WideIV,
9026                                                VPBuilder &VectorPHBuilder,
9027                                                VPBuilder &ScalarPHBuilder,
9028                                                VPTypeAnalysis &TypeInfo,
9029                                                VPValue *VectorTC) {
9030   auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
9031   // Truncated wide inductions resume from the last lane of their vector value
9032   // in the last vector iteration, which is handled elsewhere.
9033   if (WideIntOrFp && WideIntOrFp->getTruncInst())
9034     return nullptr;
9035 
9036   VPValue *Start = WideIV->getStartValue();
9037   VPValue *Step = WideIV->getStepValue();
9038   const InductionDescriptor &ID = WideIV->getInductionDescriptor();
9039   VPValue *EndValue = VectorTC;
9040   if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
9041     EndValue = VectorPHBuilder.createDerivedIV(
9042         ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
9043         Start, VectorTC, Step);
9044   }
9045 
9046   // EndValue is derived from the vector trip count (which has the same type as
9047   // the widest induction) and thus may be wider than the induction here.
9048   Type *ScalarTypeOfWideIV = TypeInfo.inferScalarType(WideIV);
9049   if (ScalarTypeOfWideIV != TypeInfo.inferScalarType(EndValue)) {
9050     EndValue = VectorPHBuilder.createScalarCast(Instruction::Trunc, EndValue,
9051                                                 ScalarTypeOfWideIV,
9052                                                 WideIV->getDebugLoc());
9053   }
9054 
9055   auto *ResumePhiRecipe =
9056       ScalarPHBuilder.createNaryOp(VPInstruction::ResumePhi, {EndValue, Start},
9057                                    WideIV->getDebugLoc(), "bc.resume.val");
9058   return ResumePhiRecipe;
9059 }
9060 
9061 /// Create resume phis in the scalar preheader for first-order recurrences,
9062 /// reductions and inductions, and update the VPIRInstructions wrapping the
9063 /// original phis in the scalar header.
9064 static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan) {
9065   VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
9066   auto *ScalarPH = Plan.getScalarPreheader();
9067   auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getSinglePredecessor());
9068   VPBuilder VectorPHBuilder(
9069       cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSinglePredecessor()));
9070   VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
9071   VPBuilder ScalarPHBuilder(ScalarPH);
9072   VPValue *OneVPV = Plan.getOrAddLiveIn(
9073       ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1));
9074   for (VPRecipeBase &ScalarPhiR : *Plan.getScalarHeader()) {
9075     auto *ScalarPhiIRI = cast<VPIRInstruction>(&ScalarPhiR);
9076     auto *ScalarPhiI = dyn_cast<PHINode>(&ScalarPhiIRI->getInstruction());
9077     if (!ScalarPhiI)
9078       break;
9079 
9080     auto *VectorPhiR = cast<VPHeaderPHIRecipe>(Builder.getRecipe(ScalarPhiI));
9081     if (auto *WideIVR = dyn_cast<VPWidenInductionRecipe>(VectorPhiR)) {
9082       if (VPValue *ResumePhi = addResumePhiRecipeForInduction(
9083               WideIVR, VectorPHBuilder, ScalarPHBuilder, TypeInfo,
9084               &Plan.getVectorTripCount())) {
9085         ScalarPhiIRI->addOperand(ResumePhi);
9086         continue;
9087       }
9088       // TODO: Also handle truncated inductions here. Computing end-values
9089       // separately should be done as a VPlan-to-VPlan optimization, after
9090       // legalizing all resume values to use the last lane from the loop.
9091       assert(cast<VPWidenIntOrFpInductionRecipe>(VectorPhiR)->getTruncInst() &&
9092              "should only skip truncated wide inductions");
9093       continue;
9094     }
9095 
9096     // The backedge value provides the value to resume coming out of a loop,
9097     // which for FORs is a vector whose last element needs to be extracted. The
9098     // start value provides the value if the loop is bypassed.
9099     bool IsFOR = isa<VPFirstOrderRecurrencePHIRecipe>(VectorPhiR);
9100     auto *ResumeFromVectorLoop = VectorPhiR->getBackedgeValue();
9101     if (IsFOR)
9102       ResumeFromVectorLoop = MiddleBuilder.createNaryOp(
9103           VPInstruction::ExtractFromEnd, {ResumeFromVectorLoop, OneVPV}, {},
9104           "vector.recur.extract");
9105     StringRef Name = IsFOR ? "scalar.recur.init" : "bc.merge.rdx";
9106     auto *ResumePhiR = ScalarPHBuilder.createNaryOp(
9107         VPInstruction::ResumePhi,
9108         {ResumeFromVectorLoop, VectorPhiR->getStartValue()}, {}, Name);
9109     ScalarPhiIRI->addOperand(ResumePhiR);
9110   }
9111 }
9112 
9113 /// Return true if \p VPV is an optimizable IV or IV use. That is, if \p VPV is
9114 /// either an untruncated wide induction, or if it increments a wide induction
9115 /// by its step.
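// For example (illustrative only): for a wide induction %iv with step 1, both
// %iv itself and its increment %iv.next = add %iv, 1 are optimizable, whereas
// a truncated %t = trunc %iv or an unrelated %x = add %iv, 2 are not.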
9116 static bool isOptimizableIVOrUse(VPValue *VPV) {
9117   VPRecipeBase *Def = VPV->getDefiningRecipe();
9118   if (!Def)
9119     return false;
9120   auto *WideIV = dyn_cast<VPWidenInductionRecipe>(Def);
9121   if (WideIV) {
9122     // VPV itself is a wide induction; separately compute the end value for exit
9123     // users if it is not a truncated IV.
9124     return isa<VPWidenPointerInductionRecipe>(WideIV) ||
9125            !cast<VPWidenIntOrFpInductionRecipe>(WideIV)->getTruncInst();
9126   }
9127 
9128   // Check if VPV is an optimizable induction increment.
9129   if (Def->getNumOperands() != 2)
9130     return false;
9131   WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(0));
9132   if (!WideIV)
9133     WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(1));
9134   if (!WideIV)
9135     return false;
9136 
9137   using namespace VPlanPatternMatch;
9138   auto &ID = WideIV->getInductionDescriptor();
9139 
9140   // Check if VPV increments the induction by the induction step.
9141   VPValue *IVStep = WideIV->getStepValue();
9142   switch (ID.getInductionOpcode()) {
9143   case Instruction::Add:
9144     return match(VPV, m_c_Binary<Instruction::Add>(m_Specific(WideIV),
9145                                                    m_Specific(IVStep)));
9146   case Instruction::FAdd:
9147     return match(VPV, m_c_Binary<Instruction::FAdd>(m_Specific(WideIV),
9148                                                     m_Specific(IVStep)));
9149   case Instruction::FSub:
9150     return match(VPV, m_Binary<Instruction::FSub>(m_Specific(WideIV),
9151                                                   m_Specific(IVStep)));
9152   case Instruction::Sub: {
9153     // IVStep will be the negated step of the subtraction. Check if Step == -1 *
9154     // IVStep.
9155     VPValue *Step;
9156     if (!match(VPV, m_Binary<Instruction::Sub>(m_VPValue(), m_VPValue(Step))) ||
9157         !Step->isLiveIn() || !IVStep->isLiveIn())
9158       return false;
9159     auto *StepCI = dyn_cast<ConstantInt>(Step->getLiveInIRValue());
9160     auto *IVStepCI = dyn_cast<ConstantInt>(IVStep->getLiveInIRValue());
9161     return StepCI && IVStepCI &&
9162            StepCI->getValue() == (-1 * IVStepCI->getValue());
9163   }
9164   default:
9165     return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
9166            match(VPV, m_GetElementPtr(m_Specific(WideIV),
9167                                       m_Specific(WideIV->getStepValue())));
9168   }
9169   llvm_unreachable("should have been covered by switch above");
9170 }
9171 
9172 // Collect VPIRInstructions for phis in the exit blocks that are modeled
9173 // in VPlan and add the exiting VPValue as operand. Some exiting values are not
9174 // modeled explicitly yet and won't be included. Those are un-truncated
9175 // VPWidenIntOrFpInductionRecipe, VPWidenPointerInductionRecipe and induction
9176 // increments.
9177 static SetVector<VPIRInstruction *>
9178 collectUsersInExitBlocks(Loop *OrigLoop, VPRecipeBuilder &Builder,
9179                          VPlan &Plan) {
9180   auto *MiddleVPBB = Plan.getMiddleBlock();
9181   SetVector<VPIRInstruction *> ExitUsersToFix;
9182   for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
9183     for (VPRecipeBase &R : *ExitVPBB) {
9184       auto *ExitIRI = dyn_cast<VPIRInstruction>(&R);
9185       if (!ExitIRI)
9186         continue;
9187       auto *ExitPhi = dyn_cast<PHINode>(&ExitIRI->getInstruction());
9188       if (!ExitPhi)
9189         break;
9190       for (VPBlockBase *PredVPBB : ExitVPBB->getPredecessors()) {
9191         BasicBlock *ExitingBB = OrigLoop->getLoopLatch();
9192         if (PredVPBB != MiddleVPBB) {
9193           SmallVector<BasicBlock *> ExitingBlocks;
9194           OrigLoop->getExitingBlocks(ExitingBlocks);
9195           assert(ExitingBlocks.size() == 2 && "only support 2 exiting blocks");
9196           ExitingBB = ExitingBB == ExitingBlocks[0] ? ExitingBlocks[1]
9197                                                     : ExitingBlocks[0];
9198         }
9199         Value *IncomingValue = ExitPhi->getIncomingValueForBlock(ExitingBB);
9200         VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue);
9201         // Exit values for inductions are computed and updated outside of VPlan
9202         // and independently of induction recipes.
9203         // TODO: Compute induction exit values in VPlan.
9204         if (isOptimizableIVOrUse(V) &&
9205             ExitVPBB->getSinglePredecessor() == MiddleVPBB)
9206           continue;
9207         ExitUsersToFix.insert(ExitIRI);
9208         ExitIRI->addOperand(V);
9209       }
9210     }
9211   }
9212   return ExitUsersToFix;
9213 }
9214 
9215 // Add exit values to \p Plan. Extracts are added for each entry in \p
9216 // ExitUsersToFix if needed and their operands are updated. Returns true if all
9217 // exit users can be handled; otherwise returns false.
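// For example (illustrative only): a non-live-in exit value %v reaching an
// LCSSA phi through the middle block is provided via
//   %ext = extract-from-end %v, 1
// i.e. the last element of %v produced by the final vector iteration.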
9218 static bool
9219 addUsersInExitBlocks(VPlan &Plan,
9220                      const SetVector<VPIRInstruction *> &ExitUsersToFix) {
9221   if (ExitUsersToFix.empty())
9222     return true;
9223 
9224   auto *MiddleVPBB = Plan.getMiddleBlock();
9225   VPBuilder B(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
9226 
9227   // Introduce extract for exiting values and update the VPIRInstructions
9228   // modeling the corresponding LCSSA phis.
9229   for (VPIRInstruction *ExitIRI : ExitUsersToFix) {
9230     for (const auto &[Idx, Op] : enumerate(ExitIRI->operands())) {
9231       // Pass live-in values used by exit phis directly through to their users
9232       // in the exit block.
9233       if (Op->isLiveIn())
9234         continue;
9235 
9236       // Currently only live-ins can be used by exit values from blocks not
9237       // exiting via the vector latch through to the middle block.
9238       if (ExitIRI->getParent()->getSinglePredecessor() != MiddleVPBB)
9239         return false;
9240 
9241       LLVMContext &Ctx = ExitIRI->getInstruction().getContext();
9242       VPValue *Ext = B.createNaryOp(VPInstruction::ExtractFromEnd,
9243                                     {Op, Plan.getOrAddLiveIn(ConstantInt::get(
9244                                              IntegerType::get(Ctx, 32), 1))});
9245       ExitIRI->setOperand(Idx, Ext);
9246     }
9247   }
9248   return true;
9249 }
9250 
9251 /// Handle users in the original exit block for first-order recurrences. The
9252 /// penultimate value of each recurrence is fed to its LCSSA phi users in
9253 /// the original exit block using the VPIRInstruction wrapping the
9254 /// corresponding LCSSA phi.
9255 static void addExitUsersForFirstOrderRecurrences(
9256     VPlan &Plan, SetVector<VPIRInstruction *> &ExitUsersToFix) {
9257   VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
9258   auto *ScalarPHVPBB = Plan.getScalarPreheader();
9259   auto *MiddleVPBB = Plan.getMiddleBlock();
9260   VPBuilder ScalarPHBuilder(ScalarPHVPBB);
9261   VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
9262   VPValue *TwoVPV = Plan.getOrAddLiveIn(
9263       ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 2));
9264 
9265   for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
9266     auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
9267     if (!FOR)
9268       continue;
9269 
9270     // This is the second phase of vectorizing first-order recurrences, creating
9271     // extracts for users outside the loop. An overview of the transformation is
9272     // described below. Suppose we have the following loop with some use after
9273     // the loop of the last a[i-1],
9274     //
9275     //   for (int i = 0; i < n; ++i) {
9276     //     t = a[i - 1];
9277     //     b[i] = a[i] - t;
9278     //   }
9279     //   use t;
9280     //
9281     // There is a first-order recurrence on "a". For this loop, the shorthand
9282     // scalar IR looks like:
9283     //
9284     //   scalar.ph:
9285     //     s.init = a[-1]
9286     //     br scalar.body
9287     //
9288     //   scalar.body:
9289     //     i = phi [0, scalar.ph], [i+1, scalar.body]
9290     //     s1 = phi [s.init, scalar.ph], [s2, scalar.body]
9291     //     s2 = a[i]
9292     //     b[i] = s2 - s1
9293     //     br cond, scalar.body, exit.block
9294     //
9295     //   exit.block:
9296     //     use = lcssa.phi [s1, scalar.body]
9297     //
9298     // In this example, s1 is a recurrence because its value depends on the
9299     // previous iteration. In the first phase of vectorization, we created a
9300     // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts
9301     // for users in the scalar preheader and exit block.
9302     //
9303     //   vector.ph:
9304     //     v_init = vector(..., ..., ..., a[-1])
9305     //     br vector.body
9306     //
9307     //   vector.body
9308     //     i = phi [0, vector.ph], [i+4, vector.body]
9309     //     v1 = phi [v_init, vector.ph], [v2, vector.body]
9310     //     v2 = a[i, i+1, i+2, i+3]
9311     //     // Next, the third phase will introduce
9312     //     //   v1' = splice(v1(3), v2(0, 1, 2))
9313     //     b[i, i+1, i+2, i+3] = v2 - v1
9314     //     br cond, vector.body, middle.block
9315     //
9316     //   middle.block:
9317     //     vector.recur.extract.for.phi = v2(2)
9318     //     vector.recur.extract = v2(3)
9319     //     br cond, scalar.ph, exit.block
9320     //
9321     //   scalar.ph:
9322     //     scalar.recur.init = phi [vector.recur.extract, middle.block],
9323     //                             [s.init, otherwise]
9324     //     br scalar.body
9325     //
9326     //   scalar.body:
9327     //     i = phi [0, scalar.ph], [i+1, scalar.body]
9328     //     s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body]
9329     //     s2 = a[i]
9330     //     b[i] = s2 - s1
9331     //     br cond, scalar.body, exit.block
9332     //
9333     //   exit.block:
9334     //     lo = lcssa.phi [s1, scalar.body],
9335     //                    [vector.recur.extract.for.phi, middle.block]
9336     //
9337     // Now update VPIRInstructions modeling LCSSA phis in the exit block.
9338     // Extract the penultimate value of the recurrence and use it as operand for
9339     // the VPIRInstruction modeling the phi.
9340     for (VPIRInstruction *ExitIRI : ExitUsersToFix) {
9341       if (ExitIRI->getOperand(0) != FOR)
9342         continue;
9343       VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
9344           VPInstruction::ExtractFromEnd, {FOR->getBackedgeValue(), TwoVPV}, {},
9345           "vector.recur.extract.for.phi");
9346       ExitIRI->setOperand(0, PenultimateElement);
9347       ExitUsersToFix.remove(ExitIRI);
9348     }
9349   }
9350 }
9351 
9352 VPlanPtr
9353 LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
9354 
9355   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
9356 
9357   // ---------------------------------------------------------------------------
9358   // Build initial VPlan: Scan the body of the loop in a topological order to
9359   // visit each basic block after having visited its predecessor basic blocks.
9360   // ---------------------------------------------------------------------------
9361 
9362   // Create initial VPlan skeleton, having a basic block for the pre-header
9363   // which contains SCEV expansions that need to happen before the CFG is
9364   // modified; a basic block for the vector pre-header, followed by a region for
9365   // the vector loop, followed by the middle basic block. The skeleton vector
9366   // loop region contains a header and latch basic blocks.
9367 
9368   bool RequiresScalarEpilogueCheck =
9369       LoopVectorizationPlanner::getDecisionAndClampRange(
9370           [this](ElementCount VF) {
9371             return !CM.requiresScalarEpilogue(VF.isVector());
9372           },
9373           Range);
9374   VPlanPtr Plan = VPlan::createInitialVPlan(Legal->getWidestInductionType(),
9375                                             PSE, RequiresScalarEpilogueCheck,
9376                                             CM.foldTailByMasking(), OrigLoop);
9377 
9378   // Don't use getDecisionAndClampRange here, because we don't know the UF,
9379   // so it is better to be conservative here rather than to split the
9380   // decision up into different VPlans.
9381   // TODO: Consider using getDecisionAndClampRange here to split up VPlans.
9382   bool IVUpdateMayOverflow = false;
9383   for (ElementCount VF : Range)
9384     IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF);
9385 
9386   DebugLoc DL = getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
9387   TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow);
9388   // Use NUW for the induction increment if we proved that it won't overflow in
9389   // the vector loop or when not folding the tail. In the latter case, we know
9390   // that the canonical induction increment will not overflow as the vector trip
9391   // count is >= increment and a multiple of the increment.
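  // For example (illustrative only): without tail folding and with
  // VF * UF = 8, the vector trip count is a multiple of 8, so the increment
  // visits 0, 8, 16, ... up to exactly the vector trip count and cannot wrap.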
9392   bool HasNUW = !IVUpdateMayOverflow || Style == TailFoldingStyle::None;
9393   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL);
9394 
9395   VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE,
9396                                 Builder);
9397 
9398   // ---------------------------------------------------------------------------
9399   // Pre-construction: record ingredients whose recipes we'll need to further
9400   // process after constructing the initial VPlan.
9401   // ---------------------------------------------------------------------------
9402 
9403   // For each interleave group which is relevant for this (possibly trimmed)
9404   // Range, add it to the set of groups to be later applied to the VPlan and add
9405   // placeholders for its members' Recipes which we'll be replacing with a
9406   // single VPInterleaveRecipe.
9407   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
9408     auto ApplyIG = [IG, this](ElementCount VF) -> bool {
9409       bool Result = (VF.isVector() && // Query is illegal for VF == 1
9410                      CM.getWideningDecision(IG->getInsertPos(), VF) ==
9411                          LoopVectorizationCostModel::CM_Interleave);
9412       // For scalable vectors, the only interleave factor currently supported
9413       // is 2 since we require the (de)interleave2 intrinsics instead of
9414       // shufflevectors.
9415       assert((!Result || !VF.isScalable() || IG->getFactor() == 2) &&
9416              "Unsupported interleave factor for scalable vectors");
9417       return Result;
9418     };
9419     if (!getDecisionAndClampRange(ApplyIG, Range))
9420       continue;
9421     InterleaveGroups.insert(IG);
9422   }
9423 
9424   // ---------------------------------------------------------------------------
9425   // Construct recipes for the instructions in the loop
9426   // ---------------------------------------------------------------------------
9427 
9428   // Scan the body of the loop in a topological order to visit each basic block
9429   // after having visited its predecessor basic blocks.
9430   LoopBlocksDFS DFS(OrigLoop);
9431   DFS.perform(LI);
9432 
9433   VPBasicBlock *HeaderVPBB = Plan->getVectorLoopRegion()->getEntryBasicBlock();
9434   VPBasicBlock *VPBB = HeaderVPBB;
9435   BasicBlock *HeaderBB = OrigLoop->getHeader();
9436   bool NeedsMasks =
9437       CM.foldTailByMasking() ||
9438       any_of(OrigLoop->blocks(), [this, HeaderBB](BasicBlock *BB) {
9439         bool NeedsBlends = BB != HeaderBB && !BB->phis().empty();
9440         return Legal->blockNeedsPredication(BB) || NeedsBlends;
9441       });
9442 
9443   RecipeBuilder.collectScaledReductions(Range);
9444 
9445   auto *MiddleVPBB = Plan->getMiddleBlock();
9446   VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi();
9447   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
9448     // Relevant instructions from basic block BB will be grouped into VPRecipe
9449     // ingredients and fill a new VPBasicBlock.
9450     if (VPBB != HeaderVPBB)
9451       VPBB->setName(BB->getName());
9452     Builder.setInsertPoint(VPBB);
9453 
9454     if (VPBB == HeaderVPBB)
9455       RecipeBuilder.createHeaderMask();
9456     else if (NeedsMasks)
9457       RecipeBuilder.createBlockInMask(BB);
9458 
9459     // Introduce each ingredient into VPlan.
9460     // TODO: Model and preserve debug intrinsics in VPlan.
9461     for (Instruction &I : drop_end(BB->instructionsWithoutDebug(false))) {
9462       Instruction *Instr = &I;
9463       SmallVector<VPValue *, 4> Operands;
9464       auto *Phi = dyn_cast<PHINode>(Instr);
9465       if (Phi && Phi->getParent() == HeaderBB) {
9466         Operands.push_back(Plan->getOrAddLiveIn(
9467             Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
9468       } else {
9469         auto OpRange = RecipeBuilder.mapToVPValues(Instr->operands());
9470         Operands = {OpRange.begin(), OpRange.end()};
9471       }
9472 
9473       // Stores with an invariant address inside the loop will be deleted, and
9474       // in the exit block, a uniform store recipe will be created for the final
9475       // invariant store of the reduction.
9476       StoreInst *SI;
9477       if ((SI = dyn_cast<StoreInst>(&I)) &&
9478           Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) {
9479         // Only create recipe for the final invariant store of the reduction.
9480         if (!Legal->isInvariantStoreOfReduction(SI))
9481           continue;
9482         auto *Recipe = new VPReplicateRecipe(
9483             SI, RecipeBuilder.mapToVPValues(Instr->operands()),
9484             true /* IsUniform */);
9485         Recipe->insertBefore(*MiddleVPBB, MBIP);
9486         continue;
9487       }
9488 
9489       VPRecipeBase *Recipe =
9490           RecipeBuilder.tryToCreateWidenRecipe(Instr, Operands, Range, VPBB);
9491       if (!Recipe)
9492         Recipe = RecipeBuilder.handleReplication(Instr, Range);
9493 
9494       RecipeBuilder.setRecipe(Instr, Recipe);
9495       if (isa<VPHeaderPHIRecipe>(Recipe)) {
9496         // VPHeaderPHIRecipes must be kept in the phi section of HeaderVPBB. In
9497         // the following cases, VPHeaderPHIRecipes may be created after non-phi
9498         // recipes and need to be moved to the phi section of HeaderVPBB:
9499         // * tail-folding (non-phi recipes computing the header mask are
9500         // introduced earlier than regular header phi recipes, and should appear
9501         // after them)
9502         // * Optimizing truncates to VPWidenIntOrFpInductionRecipe.
9503 
9504         assert((HeaderVPBB->getFirstNonPhi() == VPBB->end() ||
9505                 CM.foldTailByMasking() || isa<TruncInst>(Instr)) &&
9506                "unexpected recipe needs moving");
9507         Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
9508       } else
9509         VPBB->appendRecipe(Recipe);
9510     }
9511 
9512     VPBlockUtils::insertBlockAfter(Plan->createVPBasicBlock(""), VPBB);
9513     VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
9514   }
9515 
9516   // After here, VPBB should not be used.
9517   VPBB = nullptr;
9518 
9519   assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
9520          !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
9521          "entry block must be set to a VPRegionBlock having a non-empty entry "
9522          "VPBasicBlock");
9523   RecipeBuilder.fixHeaderPhis();
9524 
9525   // Update wide induction increments to use the same step as the corresponding
9526   // wide induction. This enables detecting induction increments directly in
9527   // VPlan and removes redundant splats.
9528   for (const auto &[Phi, ID] : Legal->getInductionVars()) {
9529     auto *IVInc = cast<Instruction>(
9530         Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
9531     if (IVInc->getOperand(0) != Phi || IVInc->getOpcode() != Instruction::Add)
9532       continue;
9533     VPWidenInductionRecipe *WideIV =
9534         cast<VPWidenInductionRecipe>(RecipeBuilder.getRecipe(Phi));
9535     VPRecipeBase *R = RecipeBuilder.getRecipe(IVInc);
9536     R->setOperand(1, WideIV->getStepValue());
9537   }
9538 
9539   if (auto *UncountableExitingBlock =
9540           Legal->getUncountableEarlyExitingBlock()) {
9541     VPlanTransforms::handleUncountableEarlyExit(
9542         *Plan, *PSE.getSE(), OrigLoop, UncountableExitingBlock, RecipeBuilder);
9543   }
9544   addScalarResumePhis(RecipeBuilder, *Plan);
9545   SetVector<VPIRInstruction *> ExitUsersToFix =
9546       collectUsersInExitBlocks(OrigLoop, RecipeBuilder, *Plan);
9547   addExitUsersForFirstOrderRecurrences(*Plan, ExitUsersToFix);
9548   if (!addUsersInExitBlocks(*Plan, ExitUsersToFix)) {
9549     reportVectorizationFailure(
9550         "Some exit values in loop with uncountable exit not supported yet",
9551         "UncountableEarlyExitLoopsUnsupportedExitValue", ORE, OrigLoop);
9552     return nullptr;
9553   }
9554 
9555   // ---------------------------------------------------------------------------
9556   // Transform initial VPlan: Apply previously taken decisions, in order, to
9557   // bring the VPlan to its final state.
9558   // ---------------------------------------------------------------------------
9559 
9560   // Adjust the recipes for any inloop reductions.
9561   adjustRecipesForReductions(Plan, RecipeBuilder, Range.Start);
9562 
9563   // Interleave memory: for each Interleave Group we marked earlier as relevant
9564   // for this VPlan, replace the Recipes widening its memory instructions with a
9565   // single VPInterleaveRecipe at its insertion point.
9566   VPlanTransforms::createInterleaveGroups(
9567       *Plan, InterleaveGroups, RecipeBuilder, CM.isScalarEpilogueAllowed());
9568 
9569   for (ElementCount VF : Range)
9570     Plan->addVF(VF);
9571   Plan->setName("Initial VPlan");
9572 
9573   // Replace VPValues for known constant strides guaranteed by predicated
9574   // scalar evolution.
9575   auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) {
9576     auto *R = cast<VPRecipeBase>(&U);
9577     return R->getParent()->getParent() ||
9578            R->getParent() ==
9579                Plan->getVectorLoopRegion()->getSinglePredecessor();
9580   };
9581   for (auto [_, Stride] : Legal->getLAI()->getSymbolicStrides()) {
9582     auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
9583     auto *ScevStride = dyn_cast<SCEVConstant>(PSE.getSCEV(StrideV));
9584     // Only handle constant strides for now.
9585     if (!ScevStride)
9586       continue;
9587 
9588     auto *CI = Plan->getOrAddLiveIn(
9589         ConstantInt::get(Stride->getType(), ScevStride->getAPInt()));
9590     if (VPValue *StrideVPV = Plan->getLiveIn(StrideV))
9591       StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
9592 
9593     // The versioned value may not be used in the loop directly but through a
9594     // sext/zext. Add new live-ins in those cases.
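    // For example (illustrative only): if versioning guarantees %stride == 1
    // and the loop uses 'zext i32 %stride to i64', users of the zext's VPValue
    // are rewritten to a new live-in i64 constant 1.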
9595     for (Value *U : StrideV->users()) {
9596       if (!isa<SExtInst, ZExtInst>(U))
9597         continue;
9598       VPValue *StrideVPV = Plan->getLiveIn(U);
9599       if (!StrideVPV)
9600         continue;
9601       unsigned BW = U->getType()->getScalarSizeInBits();
9602       APInt C = isa<SExtInst>(U) ? ScevStride->getAPInt().sext(BW)
9603                                  : ScevStride->getAPInt().zext(BW);
9604       VPValue *CI = Plan->getOrAddLiveIn(ConstantInt::get(U->getType(), C));
9605       StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
9606     }
9607   }
9608 
9609   VPlanTransforms::dropPoisonGeneratingRecipes(*Plan, [this](BasicBlock *BB) {
9610     return Legal->blockNeedsPredication(BB);
9611   });
9612 
9613   // Sink users of fixed-order recurrences past the recipe defining the previous
9614   // value and introduce FirstOrderRecurrenceSplice VPInstructions.
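  // A hedged, simplified illustration of a fixed-order recurrence:
  //   float Prev = Init;
  //   for (int i = 0; i < n; ++i) { B[i] = A[i] - Prev; Prev = A[i]; }
  // 'Prev' is a header phi carrying the previous iteration's A[i]. In the
  // vector loop it becomes a splice of the last lane of the previous vector
  // iteration with the current A[i] vector, and users of 'Prev' are sunk
  // below the recipe defining the current A[i] so both splice operands are
  // available.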
9615   if (!VPlanTransforms::adjustFixedOrderRecurrences(*Plan, Builder))
9616     return nullptr;
9617 
9618   if (useActiveLaneMask(Style)) {
9619     // TODO: Move checks to VPlanTransforms::addActiveLaneMask once
9620     // TailFoldingStyle is visible there.
9621     bool ForControlFlow = useActiveLaneMaskForControlFlow(Style);
9622     bool WithoutRuntimeCheck =
9623         Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
9624     VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow,
9625                                        WithoutRuntimeCheck);
9626   }
9627   return Plan;
9628 }
9629 
9630 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
9631   // Outer loop handling: outer loops may require CFG and instruction level
9632   // transformations before even evaluating whether vectorization is profitable.
9633   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
9634   // the vectorization pipeline.
9635   assert(!OrigLoop->isInnermost());
9636   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
9637 
9638   // Create new empty VPlan
9639   auto Plan = VPlan::createInitialVPlan(Legal->getWidestInductionType(), PSE,
9640                                         true, false, OrigLoop);
9641 
9642   // Build hierarchical CFG
9643   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
9644   HCFGBuilder.buildHierarchicalCFG();
9645 
9646   for (ElementCount VF : Range)
9647     Plan->addVF(VF);
9648 
9649   VPlanTransforms::VPInstructionsToVPRecipes(
9650       Plan,
9651       [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
9652       *PSE.getSE(), *TLI);
9653 
9654   // Remove the existing terminator of the exiting block of the top-most region.
9655   // A BranchOnCount will be added instead when adding the canonical IV recipes.
9656   auto *Term =
9657       Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator();
9658   Term->eraseFromParent();
9659 
9660   // Tail folding is not supported for outer loops, so the induction increment
9661   // is guaranteed to not wrap.
9662   bool HasNUW = true;
9663   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW,
9664                         DebugLoc());
9665 
9666   // Collect mapping of IR header phis to header phi recipes, to be used in
9667   // addScalarResumePhis.
9668   VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE,
9669                                 Builder);
9670   for (auto &R : Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
9671     if (isa<VPCanonicalIVPHIRecipe>(&R))
9672       continue;
9673     auto *HeaderR = cast<VPHeaderPHIRecipe>(&R);
9674     RecipeBuilder.setRecipe(HeaderR->getUnderlyingInstr(), HeaderR);
9675   }
9676   addScalarResumePhis(RecipeBuilder, *Plan);
9677 
9678   assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
9679   return Plan;
9680 }
9681 
9682 // Adjust the recipes for reductions. For in-loop reductions, the instructions
9683 // in the chain leading from the loop exit instr to the phi need to be converted
9684 // to reductions, with one operand being vector and the other being the scalar
9685 // reduction chain. For other reductions, a select is introduced between the phi
9686 // and users outside the vector region when folding the tail.
9687 //
9688 // A ComputeReductionResult recipe is added to the middle block, also for
9689 // in-loop reductions which compute their result in-loop, because generating
9690 // the subsequent bc.merge.rdx phi is driven by ComputeReductionResult recipes.
9691 //
9692 // Adjust AnyOf reductions; replace the reduction phi for the selected value
9693 // with a boolean reduction phi node to check if the condition is true in any
9694 // iteration. The final value is selected by the final ComputeReductionResult.
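//
// A hedged, simplified example of a reduction handled here:
//   float S = 0.f;
//   for (int i = 0; i < n; ++i)
//     S += A[i];
// If the reduction is performed in-loop (e.g. an ordered FP reduction), the
// fadd link is rewritten into a VPReductionRecipe whose operands are the
// scalar chain value from the previous link and the widened vector of A[i];
// the ComputeReductionResult in the middle block then yields the scalar value
// used after the loop.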
9695 void LoopVectorizationPlanner::adjustRecipesForReductions(
9696     VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) {
9697   using namespace VPlanPatternMatch;
9698   VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion();
9699   VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock();
9700   VPBasicBlock *MiddleVPBB = Plan->getMiddleBlock();
9701   for (VPRecipeBase &R : Header->phis()) {
9702     auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9703     if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered()))
9704       continue;
9705 
9706     const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
9707     RecurKind Kind = RdxDesc.getRecurrenceKind();
9708     assert(
9709         !RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) &&
9710         !RecurrenceDescriptor::isFindLastIVRecurrenceKind(Kind) &&
9711         "AnyOf and FindLast reductions are not allowed for in-loop reductions");
9712 
9713     // Collect the chain of "link" recipes for the reduction starting at PhiR.
9714     SetVector<VPSingleDefRecipe *> Worklist;
9715     Worklist.insert(PhiR);
9716     for (unsigned I = 0; I != Worklist.size(); ++I) {
9717       VPSingleDefRecipe *Cur = Worklist[I];
9718       for (VPUser *U : Cur->users()) {
9719         auto *UserRecipe = cast<VPSingleDefRecipe>(U);
9720         if (!UserRecipe->getParent()->getEnclosingLoopRegion()) {
9721           assert((UserRecipe->getParent() == MiddleVPBB ||
9722                   UserRecipe->getParent() == Plan->getScalarPreheader()) &&
9723                  "U must be either in the loop region, the middle block or the "
9724                  "scalar preheader.");
9725           continue;
9726         }
9727         Worklist.insert(UserRecipe);
9728       }
9729     }
9730 
9731     // Visit operation "Links" along the reduction chain top-down starting from
9732     // the phi until LoopExitValue. We keep track of the previous item
9733     // (PreviousLink) to tell which of the two operands of a Link will remain
9734     // scalar and which will be reduced. For minmax by select(cmp), Link will be
9735     // the select instruction. Blend recipes of in-loop reduction phis will
9736     // get folded to their non-phi operand, as the reduction recipe handles the
9737     // condition directly.
9738     VPSingleDefRecipe *PreviousLink = PhiR; // Aka Worklist[0].
9739     for (VPSingleDefRecipe *CurrentLink : Worklist.getArrayRef().drop_front()) {
9740       Instruction *CurrentLinkI = CurrentLink->getUnderlyingInstr();
9741 
9742       // Index of the first operand which holds a non-mask vector operand.
9743       unsigned IndexOfFirstOperand;
9744       // Recognize a call to the llvm.fmuladd intrinsic.
9745       bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
9746       VPValue *VecOp;
9747       VPBasicBlock *LinkVPBB = CurrentLink->getParent();
9748       if (IsFMulAdd) {
9749         assert(
9750             RecurrenceDescriptor::isFMulAddIntrinsic(CurrentLinkI) &&
9751             "Expected instruction to be a call to the llvm.fmuladd intrinsic");
9752         assert(((MinVF.isScalar() && isa<VPReplicateRecipe>(CurrentLink)) ||
9753                 isa<VPWidenIntrinsicRecipe>(CurrentLink)) &&
9754                CurrentLink->getOperand(2) == PreviousLink &&
9755                "expected a call where the previous link is the added operand");
9756 
9757         // If the instruction is a call to the llvm.fmuladd intrinsic then we
9758         // need to create an fmul recipe (multiplying the first two operands of
9759         // the fmuladd together) to use as the vector operand for the fadd
9760         // reduction.
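        // A hedged illustration: a link of the form
        //   %acc.next = call float @llvm.fmuladd.f32(float %a, float %b,
        //                                            float %acc)
        // becomes %mul = fmul float %a, %b (the FMul recipe created below)
        // plus a reduction recipe that fadds %mul into the chain value %acc.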
9761         VPInstruction *FMulRecipe = new VPInstruction(
9762             Instruction::FMul,
9763             {CurrentLink->getOperand(0), CurrentLink->getOperand(1)},
9764             CurrentLinkI->getFastMathFlags());
9765         LinkVPBB->insert(FMulRecipe, CurrentLink->getIterator());
9766         VecOp = FMulRecipe;
9767       } else {
9768         auto *Blend = dyn_cast<VPBlendRecipe>(CurrentLink);
9769         if (PhiR->isInLoop() && Blend) {
9770           assert(Blend->getNumIncomingValues() == 2 &&
9771                  "Blend must have 2 incoming values");
9772           if (Blend->getIncomingValue(0) == PhiR)
9773             Blend->replaceAllUsesWith(Blend->getIncomingValue(1));
9774           else {
9775             assert(Blend->getIncomingValue(1) == PhiR &&
9776                    "PhiR must be an operand of the blend");
9777             Blend->replaceAllUsesWith(Blend->getIncomingValue(0));
9778           }
9779           continue;
9780         }
9781 
9782         if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9783           if (isa<VPWidenRecipe>(CurrentLink)) {
9784             assert(isa<CmpInst>(CurrentLinkI) &&
9785                    "need to have the compare of the select");
9786             continue;
9787           }
9788           assert(isa<VPWidenSelectRecipe>(CurrentLink) &&
9789                  "must be a select recipe");
9790           IndexOfFirstOperand = 1;
9791         } else {
9792           assert((MinVF.isScalar() || isa<VPWidenRecipe>(CurrentLink)) &&
9793                  "Expected to replace a VPWidenSC");
9794           IndexOfFirstOperand = 0;
9795         }
9796         // Note that for non-commutable operands (cmp-selects), the semantics of
9797         // the cmp-select are captured in the recurrence kind.
9798         unsigned VecOpId =
9799             CurrentLink->getOperand(IndexOfFirstOperand) == PreviousLink
9800                 ? IndexOfFirstOperand + 1
9801                 : IndexOfFirstOperand;
9802         VecOp = CurrentLink->getOperand(VecOpId);
9803         assert(VecOp != PreviousLink &&
9804                CurrentLink->getOperand(CurrentLink->getNumOperands() - 1 -
9805                                        (VecOpId - IndexOfFirstOperand)) ==
9806                    PreviousLink &&
9807                "PreviousLink must be the operand other than VecOp");
9808       }
9809 
9810       BasicBlock *BB = CurrentLinkI->getParent();
9811       VPValue *CondOp = nullptr;
9812       if (CM.blockNeedsPredicationForAnyReason(BB))
9813         CondOp = RecipeBuilder.getBlockInMask(BB);
9814 
9815       auto *RedRecipe = new VPReductionRecipe(
9816           RdxDesc, CurrentLinkI, PreviousLink, VecOp, CondOp,
9817           CM.useOrderedReductions(RdxDesc), CurrentLinkI->getDebugLoc());
9818       // Append the recipe to the end of the VPBasicBlock because we need to
9819       // ensure that it comes after all of its inputs, including CondOp.
9820       // Note that this transformation may leave over dead recipes (including
9821       // CurrentLink), which will be cleaned by a later VPlan transform.
9822       LinkVPBB->appendRecipe(RedRecipe);
9823       CurrentLink->replaceAllUsesWith(RedRecipe);
9824       PreviousLink = RedRecipe;
9825     }
9826   }
9827   VPBasicBlock *LatchVPBB = VectorLoopRegion->getExitingBasicBlock();
9828   Builder.setInsertPoint(&*LatchVPBB->begin());
9829   VPBasicBlock::iterator IP = MiddleVPBB->getFirstNonPhi();
9830   for (VPRecipeBase &R :
9831        Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
9832     VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9833     if (!PhiR)
9834       continue;
9835 
9836     const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
9837     // If tail is folded by masking, introduce selects between the phi
9838     // and the users outside the vector region of each reduction, at the
9839     // beginning of the dedicated latch block.
9840     auto *OrigExitingVPV = PhiR->getBackedgeValue();
9841     auto *NewExitingVPV = PhiR->getBackedgeValue();
9842     if (!PhiR->isInLoop() && CM.foldTailByMasking()) {
9843       VPValue *Cond = RecipeBuilder.getBlockInMask(OrigLoop->getHeader());
9844       assert(OrigExitingVPV->getDefiningRecipe()->getParent() != LatchVPBB &&
9845              "reduction recipe must be defined before latch");
9846       Type *PhiTy = PhiR->getOperand(0)->getLiveInIRValue()->getType();
9847       std::optional<FastMathFlags> FMFs =
9848           PhiTy->isFloatingPointTy()
9849               ? std::make_optional(RdxDesc.getFastMathFlags())
9850               : std::nullopt;
9851       NewExitingVPV =
9852           Builder.createSelect(Cond, OrigExitingVPV, PhiR, {}, "", FMFs);
9853       OrigExitingVPV->replaceUsesWithIf(NewExitingVPV, [](VPUser &U, unsigned) {
9854         return isa<VPInstruction>(&U) &&
9855                cast<VPInstruction>(&U)->getOpcode() ==
9856                    VPInstruction::ComputeReductionResult;
9857       });
9858       if (CM.usePredicatedReductionSelect(
9859               PhiR->getRecurrenceDescriptor().getOpcode(), PhiTy))
9860         PhiR->setOperand(1, NewExitingVPV);
9861     }
9862 
9863     // If the vector reduction can be performed in a smaller type, we truncate
9864     // then extend the loop exit value to enable InstCombine to evaluate the
9865     // entire expression in the smaller type.
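    // A hedged example: for an i32 accumulator whose recurrence type was
    // computed as i8, the exiting vector value is truncated to <VF x i8> and
    // immediately sign/zero-extended back to <VF x i32>, giving InstCombine
    // the opportunity to shrink the whole reduction chain to i8.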
9866     Type *PhiTy = PhiR->getStartValue()->getLiveInIRValue()->getType();
9867     if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType() &&
9868         !RecurrenceDescriptor::isAnyOfRecurrenceKind(
9869             RdxDesc.getRecurrenceKind())) {
9870       assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
9871       Type *RdxTy = RdxDesc.getRecurrenceType();
9872       auto *Trunc =
9873           new VPWidenCastRecipe(Instruction::Trunc, NewExitingVPV, RdxTy);
9874       auto *Extnd =
9875           RdxDesc.isSigned()
9876               ? new VPWidenCastRecipe(Instruction::SExt, Trunc, PhiTy)
9877               : new VPWidenCastRecipe(Instruction::ZExt, Trunc, PhiTy);
9878 
9879       Trunc->insertAfter(NewExitingVPV->getDefiningRecipe());
9880       Extnd->insertAfter(Trunc);
9881       if (PhiR->getOperand(1) == NewExitingVPV)
9882         PhiR->setOperand(1, Extnd->getVPSingleValue());
9883       NewExitingVPV = Extnd;
9884     }
9885 
9886     // We want code in the middle block to appear to execute on the location of
9887     // the scalar loop's latch terminator because: (a) it is all compiler
9888     // generated, (b) these instructions are always executed after evaluating
9889     // the latch conditional branch, and (c) other passes may add new
9890     // predecessors which terminate on this line. This is the easiest way to
9891     // ensure we don't accidentally cause an extra step back into the loop while
9892     // debugging.
9893     DebugLoc ExitDL = OrigLoop->getLoopLatch()->getTerminator()->getDebugLoc();
9894 
9895     // TODO: At the moment ComputeReductionResult also drives creation of the
9896     // bc.merge.rdx phi nodes, hence it needs to be created unconditionally here
9897     // even for in-loop reductions, until the reduction resume value handling is
9898     // also modeled in VPlan.
9899     auto *FinalReductionResult = new VPInstruction(
9900         VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL);
9901     // Update all users outside the vector region.
9902     OrigExitingVPV->replaceUsesWithIf(
9903         FinalReductionResult, [](VPUser &User, unsigned) {
9904           auto *Parent = cast<VPRecipeBase>(&User)->getParent();
9905           return Parent && !Parent->getParent();
9906         });
9907     FinalReductionResult->insertBefore(*MiddleVPBB, IP);
9908 
9909     // Adjust AnyOf reductions; replace the reduction phi for the selected value
9910     // with a boolean reduction phi node to check if the condition is true in
9911     // any iteration. The final value is selected by the final
9912     // ComputeReductionResult.
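    // A hedged, simplified example of an AnyOf reduction:
    //   int R = 3;
    //   for (int i = 0; i < n; ++i)
    //     if (A[i] > 42) R = 7;
    // The select feeding the phi is replaced by an i1 'or' of the compare into
    // a boolean reduction phi starting at false; ComputeReductionResult then
    // picks 7 if any lane was true and the start value 3 otherwise.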
9913     if (RecurrenceDescriptor::isAnyOfRecurrenceKind(
9914             RdxDesc.getRecurrenceKind())) {
9915       auto *Select = cast<VPRecipeBase>(*find_if(PhiR->users(), [](VPUser *U) {
9916         return isa<VPWidenSelectRecipe>(U) ||
9917                (isa<VPReplicateRecipe>(U) &&
9918                 cast<VPReplicateRecipe>(U)->getUnderlyingInstr()->getOpcode() ==
9919                     Instruction::Select);
9920       }));
9921       VPValue *Cmp = Select->getOperand(0);
9922       // If the compare is checking the reduction PHI node, adjust it to check
9923       // the start value.
9924       if (VPRecipeBase *CmpR = Cmp->getDefiningRecipe()) {
9925         for (unsigned I = 0; I != CmpR->getNumOperands(); ++I)
9926           if (CmpR->getOperand(I) == PhiR)
9927             CmpR->setOperand(I, PhiR->getStartValue());
9928       }
9929       VPBuilder::InsertPointGuard Guard(Builder);
9930       Builder.setInsertPoint(Select);
9931 
9932       // If the true value of the select is the reduction phi, the new value is
9933       // selected if the negated condition is true in any iteration.
9934       if (Select->getOperand(1) == PhiR)
9935         Cmp = Builder.createNot(Cmp);
9936       VPValue *Or = Builder.createOr(PhiR, Cmp);
9937       Select->getVPSingleValue()->replaceAllUsesWith(Or);
9938 
9939       // Convert the reduction phi to operate on bools.
9940       PhiR->setOperand(0, Plan->getOrAddLiveIn(ConstantInt::getFalse(
9941                               OrigLoop->getHeader()->getContext())));
9942       continue;
9943     }
9944 
9945     if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(
9946             RdxDesc.getRecurrenceKind())) {
9947       // Adjust the start value for FindLastIV recurrences to use the sentinel
9948       // value after generating the ResumePhi recipe, which uses the original
9949       // start value.
9950       PhiR->setOperand(0, Plan->getOrAddLiveIn(RdxDesc.getSentinelValue()));
9951     }
9952   }
9953 
9954   VPlanTransforms::clearReductionWrapFlags(*Plan);
9955 }
9956 
9957 void VPDerivedIVRecipe::execute(VPTransformState &State) {
9958   assert(!State.Lane && "VPDerivedIVRecipe being replicated.");
9959 
9960   // Fast-math-flags propagate from the original induction instruction.
9961   IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
9962   if (FPBinOp)
9963     State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags());
9964 
9965   Value *Step = State.get(getStepValue(), VPLane(0));
9966   Value *Index = State.get(getOperand(1), VPLane(0));
9967   Value *DerivedIV = emitTransformedIndex(
9968       State.Builder, Index, getStartValue()->getLiveInIRValue(), Step, Kind,
9969       cast_if_present<BinaryOperator>(FPBinOp));
9970   DerivedIV->setName(Name);
9971   // If index is the vector trip count, the concrete value will only be set in
9972   // prepareToExecute, leading to missed simplifications, e.g. if it is 0.
9973   // TODO: Remove the special case for the vector trip count once it is computed
9974   // in VPlan and can be used during VPlan simplification.
9975   assert((DerivedIV != Index ||
9976           getOperand(1) == &getParent()->getPlan()->getVectorTripCount()) &&
9977          "IV didn't need transforming?");
9978   State.set(this, DerivedIV, VPLane(0));
9979 }
9980 
9981 void VPReplicateRecipe::execute(VPTransformState &State) {
9982   Instruction *UI = getUnderlyingInstr();
9983   if (State.Lane) { // Generate a single instance.
9984     assert((State.VF.isScalar() || !isUniform()) &&
9985            "uniform recipe shouldn't be predicated");
9986     assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9987     State.ILV->scalarizeInstruction(UI, this, *State.Lane, State);
9988     // Insert scalar instance packing it into a vector.
9989     if (State.VF.isVector() && shouldPack()) {
9990       // If we're constructing lane 0, initialize to start from poison.
9991       if (State.Lane->isFirstLane()) {
9992         assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9993         Value *Poison = PoisonValue::get(
9994             VectorType::get(UI->getType(), State.VF));
9995         State.set(this, Poison);
9996       }
9997       State.packScalarIntoVectorValue(this, *State.Lane);
9998     }
9999     return;
10000   }
10001 
10002   if (IsUniform) {
10003     // Uniform within VL means we need to generate lane 0.
10004     State.ILV->scalarizeInstruction(UI, this, VPLane(0), State);
10005     return;
10006   }
10007 
10008   // A store of a loop varying value to a uniform address only needs the last
10009   // copy of the store.
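  // A hedged illustration:
  //   for (int i = 0; i < n; ++i)
  //     *P = A[i];   // P does not depend on i
  // Only the value written by the final lane of the vector iteration is
  // observable, so emitting a single scalar store of the last lane suffices.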
10010   if (isa<StoreInst>(UI) &&
10011       vputils::isUniformAfterVectorization(getOperand(1))) {
10012     auto Lane = VPLane::getLastLaneForVF(State.VF);
10013     State.ILV->scalarizeInstruction(UI, this, VPLane(Lane), State);
10014     return;
10015   }
10016 
10017   // Generate scalar instances for all VF lanes.
10018   assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
10019   const unsigned EndLane = State.VF.getKnownMinValue();
10020   for (unsigned Lane = 0; Lane < EndLane; ++Lane)
10021     State.ILV->scalarizeInstruction(UI, this, VPLane(Lane), State);
10022 }
10023 
10024 // Determine how to lower the scalar epilogue, which depends on 1) optimising
10025 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
10026 // predication, and 4) a TTI hook that analyses whether the loop is suitable
10027 // for predication.
10028 static ScalarEpilogueLowering getScalarEpilogueLowering(
10029     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
10030     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
10031     LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI) {
10032   // 1) OptSize takes precedence over all other options, i.e. if this is set,
10033   // don't look at hints or options, and don't request a scalar epilogue.
10034   // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
10035   // LoopAccessInfo (due to code dependency and not being able to reliably get
10036   // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
10037   // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
10038   // versioning when the vectorization is forced, unlike hasOptSize. So revert
10039   // back to the old way and vectorize with versioning when forced. See D81345.)
10040   if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
10041                                                       PGSOQueryType::IRPass) &&
10042                           Hints.getForce() != LoopVectorizeHints::FK_Enabled))
10043     return CM_ScalarEpilogueNotAllowedOptSize;
10044 
10045   // 2) If set, obey the directives
10046   if (PreferPredicateOverEpilogue.getNumOccurrences()) {
10047     switch (PreferPredicateOverEpilogue) {
10048     case PreferPredicateTy::ScalarEpilogue:
10049       return CM_ScalarEpilogueAllowed;
10050     case PreferPredicateTy::PredicateElseScalarEpilogue:
10051       return CM_ScalarEpilogueNotNeededUsePredicate;
10052     case PreferPredicateTy::PredicateOrDontVectorize:
10053       return CM_ScalarEpilogueNotAllowedUsePredicate;
10054     };
10055   }
10056 
10057   // 3) If set, obey the hints
10058   switch (Hints.getPredicate()) {
10059   case LoopVectorizeHints::FK_Enabled:
10060     return CM_ScalarEpilogueNotNeededUsePredicate;
10061   case LoopVectorizeHints::FK_Disabled:
10062     return CM_ScalarEpilogueAllowed;
10063   };
10064 
10065   // 4) if the TTI hook indicates this is profitable, request predication.
10066   TailFoldingInfo TFI(TLI, &LVL, IAI);
10067   if (TTI->preferPredicateOverEpilogue(&TFI))
10068     return CM_ScalarEpilogueNotNeededUsePredicate;
10069 
10070   return CM_ScalarEpilogueAllowed;
10071 }
10072 
10073 // Process the loop in the VPlan-native vectorization path. This path builds
10074 // VPlan upfront in the vectorization pipeline, which allows applying
10075 // VPlan-to-VPlan transformations from the very beginning without modifying the
10076 // input LLVM IR.
10077 static bool processLoopInVPlanNativePath(
10078     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
10079     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
10080     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
10081     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
10082     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
10083     LoopVectorizationRequirements &Requirements) {
10084 
10085   if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
10086     LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
10087     return false;
10088   }
10089   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
10090   Function *F = L->getHeader()->getParent();
10091   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
10092 
10093   ScalarEpilogueLowering SEL =
10094       getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, *LVL, &IAI);
10095 
10096   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
10097                                 &Hints, IAI);
10098   // Use the planner for outer loop vectorization.
10099   // TODO: CM is not used at this point inside the planner. Turn CM into an
10100   // optional argument if we don't need it in the future.
10101   LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, IAI, PSE, Hints,
10102                                ORE);
10103 
10104   // Get user vectorization factor.
10105   ElementCount UserVF = Hints.getWidth();
10106 
10107   CM.collectElementTypesForWidening();
10108 
10109   // Plan how to best vectorize, return the best VF and its cost.
10110   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
10111 
10112   // If we are stress testing VPlan builds, do not attempt to generate vector
10113   // code. Masked vector code generation support will follow soon.
10114   // Also, do not attempt to vectorize if no vector code will be produced.
10115   if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF)
10116     return false;
10117 
10118   VPlan &BestPlan = LVP.getPlanFor(VF.Width);
10119 
10120   {
10121     bool AddBranchWeights =
10122         hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
10123     GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(),
10124                              AddBranchWeights);
10125     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
10126                            VF.Width, 1, LVL, &CM, BFI, PSI, Checks, BestPlan);
10127     LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
10128                       << L->getHeader()->getParent()->getName() << "\"\n");
10129     LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false);
10130   }
10131 
10132   reportVectorization(ORE, L, VF, 1);
10133 
10134   // Mark the loop as already vectorized to avoid vectorizing again.
10135   Hints.setAlreadyVectorized();
10136   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10137   return true;
10138 }
10139 
10140 // Emit a remark if there are stores to floats that required a floating point
10141 // extension. If the vectorized loop mixes floating-point precisions, there
10142 // will be a performance penalty from the conversion overhead and the change in
10143 // the vector width.
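// A hedged example of the pattern being flagged:
//   float *A; double D;
//   for (int i = 0; i < n; ++i)
//     A[i] = A[i] * D;   // A[i] is fpext'ed to double, then truncated back
// The fpext forces part of the loop to operate on double vectors, halving the
// effective vector width compared to an all-float loop.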
10144 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
10145   SmallVector<Instruction *, 4> Worklist;
10146   for (BasicBlock *BB : L->getBlocks()) {
10147     for (Instruction &Inst : *BB) {
10148       if (auto *S = dyn_cast<StoreInst>(&Inst)) {
10149         if (S->getValueOperand()->getType()->isFloatTy())
10150           Worklist.push_back(S);
10151       }
10152     }
10153   }
10154 
10155   // Traverse the floating point stores upwards, searching for floating point
10156   // conversions.
10157   SmallPtrSet<const Instruction *, 4> Visited;
10158   SmallPtrSet<const Instruction *, 4> EmittedRemark;
10159   while (!Worklist.empty()) {
10160     auto *I = Worklist.pop_back_val();
10161     if (!L->contains(I))
10162       continue;
10163     if (!Visited.insert(I).second)
10164       continue;
10165 
10166     // Emit a remark if the floating point store required a floating
10167     // point conversion.
10168     // TODO: More work could be done to identify the root cause such as a
10169     // constant or a function return type and point the user to it.
10170     if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
10171       ORE->emit([&]() {
10172         return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
10173                                           I->getDebugLoc(), L->getHeader())
10174                << "floating point conversion changes vector width. "
10175                << "Mixed floating point precision requires an up/down "
10176                << "cast that will negatively impact performance.";
10177       });
10178 
10179     for (Use &Op : I->operands())
10180       if (auto *OpI = dyn_cast<Instruction>(Op))
10181         Worklist.push_back(OpI);
10182   }
10183 }
10184 
10185 static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
10186                                        VectorizationFactor &VF, Loop *L,
10187                                        const TargetTransformInfo &TTI,
10188                                        PredicatedScalarEvolution &PSE,
10189                                        ScalarEpilogueLowering SEL) {
10190   InstructionCost CheckCost = Checks.getCost();
10191   if (!CheckCost.isValid())
10192     return false;
10193 
10194   // When interleaving only, the scalar and vector costs will be equal, which in
10195   // turn would lead to a divide by 0. Fall back to the hard threshold.
10196   if (VF.Width.isScalar()) {
10197     if (CheckCost > VectorizeMemoryCheckThreshold) {
10198       LLVM_DEBUG(
10199           dbgs()
10200           << "LV: Interleaving only is not profitable due to runtime checks\n");
10201       return false;
10202     }
10203     return true;
10204   }
10205 
10206   // The scalar cost should only be 0 when vectorizing with a user specified
  // VF/IC. In those cases, runtime checks should always be generated.
10207   uint64_t ScalarC = *VF.ScalarCost.getValue();
10208   if (ScalarC == 0)
10209     return true;
10210 
10211   // First, compute the minimum iteration count required so that the vector
10212   // loop outperforms the scalar loop.
10213   //  The total cost of the scalar loop is
10214   //   ScalarC * TC
10215   //  where
10216   //  * TC is the actual trip count of the loop.
10217   //  * ScalarC is the cost of a single scalar iteration.
10218   //
10219   //  The total cost of the vector loop is
10220   //    RtC + VecC * (TC / VF) + EpiC
10221   //  where
10222   //  * RtC is the cost of the generated runtime checks
10223   //  * VecC is the cost of a single vector iteration.
10224   //  * TC is the actual trip count of the loop
10225   //  * VF is the vectorization factor
10226   //  * EpiC is the cost of the generated epilogue, including the cost
10227   //    of the remaining scalar operations.
10228   //
10229   // Vectorization is profitable once the total vector cost is less than the
10230   // total scalar cost:
10231   //   RtC + VecC * (TC / VF) + EpiC <  ScalarC * TC
10232   //
10233   // Now we can compute the minimum required trip count TC as
10234   //   VF * (RtC + EpiC) / (ScalarC * VF - VecC) < TC
10235   //
10236   // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
10237   // the divisions below are rounded up (divideCeil), hence we get an upper
10238   // estimate of the minimum required TC.
10239   unsigned IntVF = getEstimatedRuntimeVF(L, TTI, VF.Width);
10240   uint64_t RtC = *CheckCost.getValue();
10241   uint64_t Div = ScalarC * IntVF - *VF.Cost.getValue();
10242   uint64_t MinTC1 = Div == 0 ? 0 : divideCeil(RtC * IntVF, Div);
10243 
10244   // Second, compute a minimum iteration count so that the cost of the
10245   // runtime checks is only a fraction of the total scalar loop cost. This
10246   // adds a loop-dependent bound on the overhead incurred if the runtime
10247   // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC
10248   // * TC. To bound the runtime check to be a fraction 1/X of the scalar
10249   // cost, compute
10250   //   RtC < ScalarC * TC * (1 / X)  ==>  RtC * X / ScalarC < TC
10251   uint64_t MinTC2 = divideCeil(RtC * 10, ScalarC);
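  // A purely illustrative, hedged example: with ScalarC = 4, VecC = 8,
  // RtC = 20 and IntVF = 4, the first bound is
  //   MinTC1 = ceil(20 * 4 / (4 * 4 - 8)) = 10,
  // and the second bound (X = 10) is
  //   MinTC2 = ceil(20 * 10 / 4) = 50,
  // so here the check-overhead bound dominates and MinTC = 50 is used below.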
10252 
10253   // Now pick the larger minimum. If it is not a multiple of VF and a scalar
10254   // epilogue is allowed, choose the next closest multiple of VF. This should
10255   // partly compensate for ignoring the epilogue cost.
10256   uint64_t MinTC = std::max(MinTC1, MinTC2);
10257   if (SEL == CM_ScalarEpilogueAllowed)
10258     MinTC = alignTo(MinTC, IntVF);
10259   VF.MinProfitableTripCount = ElementCount::getFixed(MinTC);
10260 
10261   LLVM_DEBUG(
10262       dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
10263              << VF.MinProfitableTripCount << "\n");
10264 
10265   // Skip vectorization if the expected trip count is less than the minimum
10266   // required trip count.
10267   if (auto ExpectedTC = getSmallBestKnownTC(PSE, L)) {
10268     if (ElementCount::isKnownLT(ElementCount::getFixed(*ExpectedTC),
10269                                 VF.MinProfitableTripCount)) {
10270       LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
10271                            "trip count < minimum profitable VF ("
10272                         << *ExpectedTC << " < " << VF.MinProfitableTripCount
10273                         << ")\n");
10274 
10275       return false;
10276     }
10277   }
10278   return true;
10279 }
10280 
10281 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
10282     : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
10283                                !EnableLoopInterleaving),
10284       VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
10285                               !EnableLoopVectorization) {}
10286 
10287 /// Prepare \p MainPlan for vectorizing the main vector loop during epilogue
10288 /// vectorization. Remove ResumePhis from \p MainPlan for inductions that
10289 /// don't have a corresponding wide induction in \p EpiPlan.
10290 static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) {
10291   // Collect PHI nodes of widened phis in the VPlan for the epilogue. Those
10292   // will need their resume-values computed in the main vector loop. Others
10293   // can be removed from the main VPlan.
10294   SmallPtrSet<PHINode *, 2> EpiWidenedPhis;
10295   for (VPRecipeBase &R :
10296        EpiPlan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
10297     if (isa<VPCanonicalIVPHIRecipe>(&R))
10298       continue;
10299     EpiWidenedPhis.insert(
10300         cast<PHINode>(R.getVPSingleValue()->getUnderlyingValue()));
10301   }
10302   for (VPRecipeBase &R : make_early_inc_range(
10303            *cast<VPIRBasicBlock>(MainPlan.getScalarHeader()))) {
10304     auto *VPIRInst = cast<VPIRInstruction>(&R);
10305     auto *IRI = dyn_cast<PHINode>(&VPIRInst->getInstruction());
10306     if (!IRI)
10307       break;
10308     if (EpiWidenedPhis.contains(IRI))
10309       continue;
10310     // There is no corresponding wide induction in the epilogue plan that would
10311     // need a resume value. Remove the VPIRInst wrapping the scalar header phi
10312     // together with the corresponding ResumePhi. The resume values for the
10313     // scalar loop will be created during execution of EpiPlan.
10314     VPRecipeBase *ResumePhi = VPIRInst->getOperand(0)->getDefiningRecipe();
10315     VPIRInst->eraseFromParent();
10316     ResumePhi->eraseFromParent();
10317   }
10318   VPlanTransforms::removeDeadRecipes(MainPlan);
10319 
10320   using namespace VPlanPatternMatch;
10321   VPBasicBlock *MainScalarPH = MainPlan.getScalarPreheader();
10322   VPValue *VectorTC = &MainPlan.getVectorTripCount();
10323   // If there is a suitable resume value for the canonical induction in the
10324   // scalar (which will become vector) epilogue loop we are done. Otherwise
10325   // create it below.
10326   if (any_of(*MainScalarPH, [VectorTC](VPRecipeBase &R) {
10327         return match(&R, m_VPInstruction<VPInstruction::ResumePhi>(
10328                              m_Specific(VectorTC), m_SpecificInt(0)));
10329       }))
10330     return;
10331   VPBuilder ScalarPHBuilder(MainScalarPH, MainScalarPH->begin());
10332   ScalarPHBuilder.createNaryOp(
10333       VPInstruction::ResumePhi,
10334       {VectorTC, MainPlan.getCanonicalIV()->getStartValue()}, {},
10335       "vec.epilog.resume.val");
10336 }
10337 
10338 /// Prepare \p Plan for vectorizing the epilogue loop. That is, re-use expanded
10339 /// SCEVs from \p ExpandedSCEVs and set resume values for header recipes.
10340 static void
10341 preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L,
10342                                  const SCEV2ValueTy &ExpandedSCEVs,
10343                                  const EpilogueLoopVectorizationInfo &EPI) {
10344   VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
10345   VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
10346   Header->setName("vec.epilog.vector.body");
10347 
10348   // Re-use the trip count and steps expanded for the main loop, as
10349   // skeleton creation needs them as values that dominate both the scalar
10350   // and vector epilogue loops.
10351   // TODO: This is a workaround needed for epilogue vectorization and it
10352   // should be removed once induction resume value creation is done
10353   // directly in VPlan.
10354   for (auto &R : make_early_inc_range(*Plan.getEntry())) {
10355     auto *ExpandR = dyn_cast<VPExpandSCEVRecipe>(&R);
10356     if (!ExpandR)
10357       continue;
10358     auto *ExpandedVal =
10359         Plan.getOrAddLiveIn(ExpandedSCEVs.find(ExpandR->getSCEV())->second);
10360     ExpandR->replaceAllUsesWith(ExpandedVal);
10361     if (Plan.getTripCount() == ExpandR)
10362       Plan.resetTripCount(ExpandedVal);
10363     ExpandR->eraseFromParent();
10364   }
10365 
10366   // Ensure that the start values for all header phi recipes are updated before
10367   // vectorizing the epilogue loop.
10368   for (VPRecipeBase &R : Header->phis()) {
10369     if (auto *IV = dyn_cast<VPCanonicalIVPHIRecipe>(&R)) {
10370       // When vectorizing the epilogue loop, the canonical induction start
10371       // value needs to be changed from zero to the value after the main
10372       // vector loop. Find the resume value created during execution of the main
10373       // VPlan.
10374       // FIXME: Improve modeling for canonical IV start values in the epilogue
10375       // loop.
10376       BasicBlock *MainMiddle = find_singleton<BasicBlock>(
10377           predecessors(L->getLoopPreheader()),
10378           [&EPI](BasicBlock *BB, bool) -> BasicBlock * {
10379             if (BB != EPI.MainLoopIterationCountCheck &&
10380                 BB != EPI.EpilogueIterationCountCheck &&
10381                 BB != EPI.SCEVSafetyCheck && BB != EPI.MemSafetyCheck)
10382               return BB;
10383             return nullptr;
10384           });
10385       using namespace llvm::PatternMatch;
10386       Type *IdxTy = IV->getScalarType();
10387       PHINode *EPResumeVal = find_singleton<PHINode>(
10388           L->getLoopPreheader()->phis(),
10389           [&EPI, IdxTy, MainMiddle](PHINode &P, bool) -> PHINode * {
10390             if (P.getType() == IdxTy &&
10391                 P.getIncomingValueForBlock(MainMiddle) == EPI.VectorTripCount &&
10392                 match(
10393                     P.getIncomingValueForBlock(EPI.MainLoopIterationCountCheck),
10394                     m_SpecificInt(0)))
10395               return &P;
10396             return nullptr;
10397           });
10398       assert(EPResumeVal && "must have a resume value for the canonical IV");
10399       VPValue *VPV = Plan.getOrAddLiveIn(EPResumeVal);
10400       assert(all_of(IV->users(),
10401                     [](const VPUser *U) {
10402                       return isa<VPScalarIVStepsRecipe>(U) ||
10403                              isa<VPScalarCastRecipe>(U) ||
10404                              isa<VPDerivedIVRecipe>(U) ||
10405                              cast<VPInstruction>(U)->getOpcode() ==
10406                                  Instruction::Add;
10407                     }) &&
10408              "the canonical IV should only be used by its increment or "
10409              "ScalarIVSteps when resetting the start value");
10410       IV->setOperand(0, VPV);
10411       continue;
10412     }
10413 
10414     Value *ResumeV = nullptr;
10415     // TODO: Move setting of resume values to prepareToExecute.
10416     if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
10417       ResumeV = cast<PHINode>(ReductionPhi->getUnderlyingInstr())
10418                     ->getIncomingValueForBlock(L->getLoopPreheader());
10419       const RecurrenceDescriptor &RdxDesc =
10420           ReductionPhi->getRecurrenceDescriptor();
10421       RecurKind RK = RdxDesc.getRecurrenceKind();
10422       if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
10423         // VPReductionPHIRecipes for AnyOf reductions expect a boolean as
10424         // start value; compare the final value from the main vector loop
10425         // to the start value.
10426         IRBuilder<> Builder(
10427             cast<Instruction>(ResumeV)->getParent()->getFirstNonPHI());
10428         ResumeV =
10429             Builder.CreateICmpNE(ResumeV, RdxDesc.getRecurrenceStartValue());
10430       } else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) {
10431         // VPReductionPHIRecipe for FindLastIV reductions requires an adjustment
10432         // to the resume value. The resume value is adjusted to the sentinel
10433         // value when the final value from the main vector loop equals the start
10434         // value. This ensures correctness when the start value might not be
10435         // less than the minimum value of a monotonically increasing induction
10436         // variable.
10437         IRBuilder<> Builder(
10438             cast<Instruction>(ResumeV)->getParent()->getFirstNonPHI());
10439         Value *Cmp =
10440             Builder.CreateICmpEQ(ResumeV, RdxDesc.getRecurrenceStartValue());
10441         ResumeV =
10442             Builder.CreateSelect(Cmp, RdxDesc.getSentinelValue(), ResumeV);
10443       }
10444     } else {
10445       // Retrieve the induction resume values for wide inductions from
10446       // their original phi nodes in the scalar loop.
10447       PHINode *IndPhi = cast<VPWidenInductionRecipe>(&R)->getPHINode();
10448       // Hook up to the PHINode generated by a ResumePhi recipe of main
10449       // loop VPlan, which feeds the scalar loop.
10450       ResumeV = IndPhi->getIncomingValueForBlock(L->getLoopPreheader());
10451     }
10452     assert(ResumeV && "Must have a resume value");
10453     VPValue *StartVal = Plan.getOrAddLiveIn(ResumeV);
10454     cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
10455   }
10456 }
10457 
10458 bool LoopVectorizePass::processLoop(Loop *L) {
10459   assert((EnableVPlanNativePath || L->isInnermost()) &&
10460          "VPlan-native path is not enabled. Only process inner loops.");
10461 
10462   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
10463                     << L->getHeader()->getParent()->getName() << "' from "
10464                     << L->getLocStr() << "\n");
10465 
10466   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
10467 
10468   LLVM_DEBUG(
10469       dbgs() << "LV: Loop hints:"
10470              << " force="
10471              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
10472                      ? "disabled"
10473                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
10474                             ? "enabled"
10475                             : "?"))
10476              << " width=" << Hints.getWidth()
10477              << " interleave=" << Hints.getInterleave() << "\n");
10478 
10479   // Function containing loop
10480   Function *F = L->getHeader()->getParent();
10481 
10482   // Looking at the diagnostic output is the only way to determine if a loop
10483   // was vectorized (other than looking at the IR or machine code), so it
10484   // is important to generate an optimization remark for each loop. Most of
10485   // these messages are generated as OptimizationRemarkAnalysis. Remarks
10486   // generated as OptimizationRemark and OptimizationRemarkMissed are
10487   // less verbose, reporting vectorized loops and unvectorized loops that may
10488   // benefit from vectorization, respectively.
10489 
10490   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
10491     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
10492     return false;
10493   }
10494 
10495   PredicatedScalarEvolution PSE(*SE, *L);
10496 
10497   // Check if it is legal to vectorize the loop.
10498   LoopVectorizationRequirements Requirements;
10499   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE,
10500                                 &Requirements, &Hints, DB, AC, BFI, PSI);
10501   if (!LVL.canVectorize(EnableVPlanNativePath)) {
10502     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
10503     Hints.emitRemarkWithHints();
10504     return false;
10505   }
10506 
10507   if (LVL.hasUncountableEarlyExit() && !EnableEarlyExitVectorization) {
10508     reportVectorizationFailure("Auto-vectorization of loops with uncountable "
10509                                "early exit is not enabled",
10510                                "UncountableEarlyExitLoopsDisabled", ORE, L);
10511     return false;
10512   }
10513 
10514   if (LVL.hasStructVectorCall()) {
10515     reportVectorizationFailure("Auto-vectorization of calls that return struct "
10516                                "types is not yet supported",
10517                                "StructCallVectorizationUnsupported", ORE, L);
10518     return false;
10519   }
10520 
10521   // Entrance to the VPlan-native vectorization path. Outer loops are processed
10522   // here. They may require CFG and instruction level transformations before
10523   // even evaluating whether vectorization is profitable. Since we cannot modify
10524   // the incoming IR, we need to build VPlan upfront in the vectorization
10525   // pipeline.
10526   if (!L->isInnermost())
10527     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
10528                                         ORE, BFI, PSI, Hints, Requirements);
10529 
10530   assert(L->isInnermost() && "Inner loop expected.");
10531 
10532   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
10533   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
10534 
10535   // If an override option has been passed in for interleaved accesses, use it.
10536   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
10537     UseInterleaved = EnableInterleavedMemAccesses;
10538 
10539   // Analyze interleaved memory accesses.
10540   if (UseInterleaved)
10541     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
10542 
10543   if (LVL.hasUncountableEarlyExit()) {
10544     BasicBlock *LoopLatch = L->getLoopLatch();
10545     if (IAI.requiresScalarEpilogue() ||
10546         any_of(LVL.getCountableExitingBlocks(),
10547                [LoopLatch](BasicBlock *BB) { return BB != LoopLatch; })) {
10548       reportVectorizationFailure("Auto-vectorization of early exit loops "
10549                                  "requiring a scalar epilogue is unsupported",
10550                                  "UncountableEarlyExitUnsupported", ORE, L);
10551       return false;
10552     }
10553   }
10554 
10555   // Check the function attributes and profiles to find out if this function
10556   // should be optimized for size.
10557   ScalarEpilogueLowering SEL =
10558       getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL, &IAI);
10559 
10560   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
10561   // count by optimizing for size, to minimize overheads.
10562   auto ExpectedTC = getSmallBestKnownTC(PSE, L);
10563   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
10564     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
10565                       << "This loop is worth vectorizing only if no scalar "
10566                       << "iteration overheads are incurred.");
10567     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
10568       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
10569     else {
10570       if (*ExpectedTC > TTI->getMinTripCountTailFoldingThreshold()) {
10571         LLVM_DEBUG(dbgs() << "\n");
10572         // Predicated, tail-folded loops are efficient even when the loop
10573         // iteration count is low. However, setting the epilogue policy to
10574         // `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops
10575         // with runtime checks. It's more effective to let
10576         // `areRuntimeChecksProfitable` determine if vectorization is beneficial
10577         // for the loop.
10578         if (SEL != CM_ScalarEpilogueNotNeededUsePredicate)
10579           SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
10580       } else {
10581         LLVM_DEBUG(dbgs() << " But the target considers the trip count too "
10582                              "small to consider vectorizing.\n");
10583         reportVectorizationFailure(
10584             "The trip count is below the minimal threshold value.",
10585             "loop trip count is too low, avoiding vectorization",
10586             "LowTripCount", ORE, L);
10587         Hints.emitRemarkWithHints();
10588         return false;
10589       }
10590     }
10591   }
10592 
10593   // Check the function attributes to see if implicit floats or vectors are
10594   // allowed.
10595   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
10596     reportVectorizationFailure(
10597         "Can't vectorize when the NoImplicitFloat attribute is used",
10598         "loop not vectorized due to NoImplicitFloat attribute",
10599         "NoImplicitFloat", ORE, L);
10600     Hints.emitRemarkWithHints();
10601     return false;
10602   }
10603 
10604   // Check if the target supports potentially unsafe FP vectorization.
10605   // FIXME: Add a check for the type of safety issue (denormal, signaling)
10606   // for the target we're vectorizing for, to make sure none of the
10607   // additional fp-math flags can help.
10608   if (Hints.isPotentiallyUnsafe() &&
10609       TTI->isFPVectorizationPotentiallyUnsafe()) {
10610     reportVectorizationFailure(
10611         "Potentially unsafe FP op prevents vectorization",
10612         "loop not vectorized due to unsafe FP support.",
10613         "UnsafeFP", ORE, L);
10614     Hints.emitRemarkWithHints();
10615     return false;
10616   }
10617 
10618   bool AllowOrderedReductions;
10619   // If the flag is set, use that instead and override the TTI behaviour.
10620   if (ForceOrderedReductions.getNumOccurrences() > 0)
10621     AllowOrderedReductions = ForceOrderedReductions;
10622   else
10623     AllowOrderedReductions = TTI->enableOrderedReductions();
10624   if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) {
10625     ORE->emit([&]() {
10626       auto *ExactFPMathInst = Requirements.getExactFPInst();
10627       return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
10628                                                  ExactFPMathInst->getDebugLoc(),
10629                                                  ExactFPMathInst->getParent())
10630              << "loop not vectorized: cannot prove it is safe to reorder "
10631                 "floating-point operations";
10632     });
10633     LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
10634                          "reorder floating-point operations\n");
10635     Hints.emitRemarkWithHints();
10636     return false;
10637   }
10638 
10639   // Use the cost model.
10640   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
10641                                 F, &Hints, IAI);
10642   // Use the planner for vectorization.
10643   LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
10644                                ORE);
10645 
10646   // Get user vectorization factor and interleave count.
10647   ElementCount UserVF = Hints.getWidth();
10648   unsigned UserIC = Hints.getInterleave();
10649 
10650   // Plan how to best vectorize.
10651   LVP.plan(UserVF, UserIC);
10652   VectorizationFactor VF = LVP.computeBestVF();
10653   unsigned IC = 1;
10654 
10655   if (ORE->allowExtraAnalysis(LV_NAME))
10656     LVP.emitInvalidCostRemarks(ORE);
10657 
10658   bool AddBranchWeights =
10659       hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
10660   GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(),
10661                            AddBranchWeights);
10662   if (LVP.hasPlanWithVF(VF.Width)) {
10663     // Select the interleave count.
10664     IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
10665 
10666     unsigned SelectedIC = std::max(IC, UserIC);
10667     //  Optimistically generate runtime checks if they are needed. Drop them if
10668     //  they turn out to not be profitable.
10669     if (VF.Width.isVector() || SelectedIC > 1)
10670       Checks.create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
10671 
10672     // Check if it is profitable to vectorize with runtime checks.
10673     bool ForceVectorization =
10674         Hints.getForce() == LoopVectorizeHints::FK_Enabled;
10675     if (!ForceVectorization &&
10676         !areRuntimeChecksProfitable(Checks, VF, L, *TTI, PSE, SEL)) {
10677       ORE->emit([&]() {
10678         return OptimizationRemarkAnalysisAliasing(
10679                    DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
10680                    L->getHeader())
10681                << "loop not vectorized: cannot prove it is safe to reorder "
10682                   "memory operations";
10683       });
10684       LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
10685       Hints.emitRemarkWithHints();
10686       return false;
10687     }
10688   }
10689 
10690   // Identify the diagnostic messages that should be produced.
10691   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
10692   bool VectorizeLoop = true, InterleaveLoop = true;
10693   if (VF.Width.isScalar()) {
10694     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
10695     VecDiagMsg = std::make_pair(
10696         "VectorizationNotBeneficial",
10697         "the cost-model indicates that vectorization is not beneficial");
10698     VectorizeLoop = false;
10699   }
10700 
10701   if (!LVP.hasPlanWithVF(VF.Width) && UserIC > 1) {
10702     // Tell the user interleaving was avoided up-front, despite being explicitly
10703     // requested.
10704     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
10705                          "interleaving should be avoided up front\n");
10706     IntDiagMsg = std::make_pair(
10707         "InterleavingAvoided",
10708         "Ignoring UserIC, because interleaving was avoided up front");
10709     InterleaveLoop = false;
10710   } else if (IC == 1 && UserIC <= 1) {
10711     // Tell the user interleaving is not beneficial.
10712     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
10713     IntDiagMsg = std::make_pair(
10714         "InterleavingNotBeneficial",
10715         "the cost-model indicates that interleaving is not beneficial");
10716     InterleaveLoop = false;
10717     if (UserIC == 1) {
10718       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
10719       IntDiagMsg.second +=
10720           " and is explicitly disabled or interleave count is set to 1";
10721     }
10722   } else if (IC > 1 && UserIC == 1) {
10723     // Tell the user interleaving is beneficial but is explicitly disabled.
10724     LLVM_DEBUG(dbgs() << "LV: Interleaving is beneficial but is explicitly "
10725                          "disabled.\n");
10726     IntDiagMsg = std::make_pair(
10727         "InterleavingBeneficialButDisabled",
10728         "the cost-model indicates that interleaving is beneficial "
10729         "but is explicitly disabled or interleave count is set to 1");
10730     InterleaveLoop = false;
10731   }
10732 
10733   // If there is a histogram in the loop, do not just interleave without
10734   // vectorizing. The order of operations will be incorrect without the
10735   // histogram intrinsics, which are only used for recipes with VF > 1.
10736   if (!VectorizeLoop && InterleaveLoop && LVL.hasHistograms()) {
10737     LLVM_DEBUG(dbgs() << "LV: Not interleaving without vectorization due "
10738                       << "to histogram operations.\n");
10739     IntDiagMsg = std::make_pair(
10740         "HistogramPreventsScalarInterleaving",
10741         "Unable to interleave without vectorization due to constraints on "
10742         "the order of histogram operations");
10743     InterleaveLoop = false;
10744   }
10745 
10746   // Override IC if user provided an interleave count.
10747   IC = UserIC > 0 ? UserIC : IC;
10748 
10749   // Emit diagnostic messages, if any.
10750   const char *VAPassName = Hints.vectorizeAnalysisPassName();
10751   if (!VectorizeLoop && !InterleaveLoop) {
10752     // Do not vectorize or interleave the loop.
10753     ORE->emit([&]() {
10754       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
10755                                       L->getStartLoc(), L->getHeader())
10756              << VecDiagMsg.second;
10757     });
10758     ORE->emit([&]() {
10759       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
10760                                       L->getStartLoc(), L->getHeader())
10761              << IntDiagMsg.second;
10762     });
10763     return false;
10764   }
10765 
10766   if (!VectorizeLoop && InterleaveLoop) {
10767     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10768     ORE->emit([&]() {
10769       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
10770                                         L->getStartLoc(), L->getHeader())
10771              << VecDiagMsg.second;
10772     });
10773   } else if (VectorizeLoop && !InterleaveLoop) {
10774     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10775                       << ") in " << L->getLocStr() << '\n');
10776     ORE->emit([&]() {
10777       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
10778                                         L->getStartLoc(), L->getHeader())
10779              << IntDiagMsg.second;
10780     });
10781   } else if (VectorizeLoop && InterleaveLoop) {
10782     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10783                       << ") in " << L->getLocStr() << '\n');
10784     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10785   }
10786 
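        // Remember the original loop metadata so follow-up metadata can be attached
        // to the remainder loop below. DisableRuntimeUnroll is set when no runtime
        // checks were added, since a rarely executed scalar remainder is not worth
        // unrolling.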
10787   bool DisableRuntimeUnroll = false;
10788   MDNode *OrigLoopID = L->getLoopID();
10789   {
10790     using namespace ore;
10791     if (!VectorizeLoop) {
10792       assert(IC > 1 && "interleave count should not be 1 or 0");
10793       // If we decided that it is not profitable to vectorize the loop, then
10794       // interleave it instead.
10795       VPlan &BestPlan = LVP.getPlanFor(VF.Width);
10796       InnerLoopVectorizer Unroller(
10797           L, PSE, LI, DT, TLI, TTI, AC, ORE, ElementCount::getFixed(1),
10798           ElementCount::getFixed(1), IC, &LVL, &CM, BFI, PSI, Checks, BestPlan);
10799 
10800       LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);
10801 
10802       ORE->emit([&]() {
10803         return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
10804                                   L->getHeader())
10805                << "interleaved loop (interleaved count: "
10806                << NV("InterleaveCount", IC) << ")";
10807       });
10808     } else {
10809       // If we decided that vectorizing the loop is profitable, then do it.
10810 
10811       VPlan &BestPlan = LVP.getPlanFor(VF.Width);
10812       // Consider vectorizing the epilogue too if it's profitable.
10813       VectorizationFactor EpilogueVF =
10814           LVP.selectEpilogueVectorizationFactor(VF.Width, IC);
10815       if (EpilogueVF.Width.isVector()) {
10816         std::unique_ptr<VPlan> BestMainPlan(BestPlan.duplicate());
10817 
10818         // The first pass vectorizes the main loop and creates a scalar epilogue
10819         // to be vectorized by executing the plan (potentially with a different
10820         // factor) again shortly afterwards.
10821         VPlan &BestEpiPlan = LVP.getPlanFor(EpilogueVF.Width);
10822         preparePlanForMainVectorLoop(*BestMainPlan, BestEpiPlan);
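              // EPI carries the main loop's VF/UF and the epilogue VF (with UF 1)
              // so that both vectorization passes agree on the loop structure.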
10823         EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1,
10824                                           BestEpiPlan);
10825         EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
10826                                            EPI, &LVL, &CM, BFI, PSI, Checks,
10827                                            *BestMainPlan);
10828         auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF,
10829                                              *BestMainPlan, MainILV, DT, false);
10830         ++LoopsVectorized;
10831 
10832         // Second pass vectorizes the epilogue and adjusts the control flow
10833         // edges from the first pass.
10834         EPI.MainLoopVF = EPI.EpilogueVF;
10835         EPI.MainLoopUF = EPI.EpilogueUF;
10836         EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
10837                                                  ORE, EPI, &LVL, &CM, BFI, PSI,
10838                                                  Checks, BestEpiPlan);
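              // Reuse the trip count computed for the main vector loop and hook
              // the epilogue plan up to the skeleton created by the first pass.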
10839         EpilogILV.setTripCount(MainILV.getTripCount());
10840         preparePlanForEpilogueVectorLoop(BestEpiPlan, L, ExpandedSCEVs, EPI);
10841 
10842         LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
10843                         DT, true, &ExpandedSCEVs);
10844         ++LoopsEpilogueVectorized;
10845 
10846         if (!MainILV.areSafetyChecksAdded())
10847           DisableRuntimeUnroll = true;
10848       } else {
10849         InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
10850                                VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
10851                                PSI, Checks, BestPlan);
10852         LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
10853         ++LoopsVectorized;
10854 
10855         // Add metadata to disable runtime unrolling of the scalar remainder
10856         // loop when no runtime checks for strides and memory were added. A
10857         // scalar loop that is rarely executed is not worth unrolling.
10858         if (!LB.areSafetyChecksAdded())
10859           DisableRuntimeUnroll = true;
10860       }
10861       // Report the vectorization decision.
10862       reportVectorization(ORE, L, VF, IC);
10863     }
10864 
10865     if (ORE->allowExtraAnalysis(LV_NAME))
10866       checkMixedPrecision(L, ORE);
10867   }
10868 
10869   assert(DT->verify(DominatorTree::VerificationLevel::Fast) &&
10870          "DT not preserved correctly");
10871 
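        // Attach any requested follow-up metadata (followup_all/followup_epilogue)
        // to the scalar remainder loop. If none was provided, mark the remainder as
        // already vectorized (and, if requested, disable its runtime unrolling) so
        // it is not transformed again.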
10872   std::optional<MDNode *> RemainderLoopID =
10873       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
10874                                       LLVMLoopVectorizeFollowupEpilogue});
10875   if (RemainderLoopID) {
10876     L->setLoopID(*RemainderLoopID);
10877   } else {
10878     if (DisableRuntimeUnroll)
10879       addRuntimeUnrollDisableMetaData(L);
10880 
10881     // Mark the loop as already vectorized to avoid vectorizing again.
10882     Hints.setAlreadyVectorized();
10883   }
10884 
10885   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10886   return true;
10887 }
10888 
10889 LoopVectorizeResult LoopVectorizePass::runImpl(Function &F) {
10891   // Don't attempt vectorization if
10892   // 1. the target claims to have no vector registers, and
10893   // 2. interleaving won't help ILP.
10894   //
10895   // The second condition is necessary because, even if the target has no
10896   // vector registers, loop vectorization may still enable scalar
10897   // interleaving.
10898   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
10899       TTI->getMaxInterleaveFactor(ElementCount::getFixed(1)) < 2)
10900     return LoopVectorizeResult(false, false);
10901 
10902   bool Changed = false, CFGChanged = false;
10903 
10904   // The vectorizer requires loops to be in simplified form.
10905   // Since simplification may add new inner loops, it has to run before the
10906   // legality and profitability checks. This means running the loop vectorizer
10907   // will simplify all loops, regardless of whether anything ends up being
10908   // vectorized.
10909   for (const auto &L : *LI)
10910     Changed |= CFGChanged |=
10911         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10912 
10913   // Build up a worklist of inner-loops to vectorize. This is necessary as
10914   // the act of vectorizing or partially unrolling a loop creates new loops
10915   // and can invalidate iterators across the loops.
10916   SmallVector<Loop *, 8> Worklist;
10917 
10918   for (Loop *L : *LI)
10919     collectSupportedLoops(*L, LI, ORE, Worklist);
10920 
10921   LoopsAnalyzed += Worklist.size();
10922 
10923   // Now walk the identified inner loops.
10924   while (!Worklist.empty()) {
10925     Loop *L = Worklist.pop_back_val();
10926 
10927     // For the inner loops we actually process, form LCSSA to simplify the
10928     // transform.
10929     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
10930 
10931     Changed |= CFGChanged |= processLoop(L);
10932 
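          // If anything changed, drop the cached loop-access analyses and, in
          // asserts builds with SCEV verification enabled, re-verify SCEV.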
10933     if (Changed) {
10934       LAIs->clear();
10935 
10936 #ifndef NDEBUG
10937       if (VerifySCEV)
10938         SE->verify();
10939 #endif
10940     }
10941   }
10942 
10943   // Report whether any loop was modified and whether the CFG changed.
10944   return LoopVectorizeResult(Changed, CFGChanged);
10945 }
10946 
10947 PreservedAnalyses LoopVectorizePass::run(Function &F,
10948                                          FunctionAnalysisManager &AM) {
10949   LI = &AM.getResult<LoopAnalysis>(F);
10950   // If there are no loops in the function, return before computing other
10951   // expensive analyses.
10952   if (LI->empty())
10953     return PreservedAnalyses::all();
10954   SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
10955   TTI = &AM.getResult<TargetIRAnalysis>(F);
10956   DT = &AM.getResult<DominatorTreeAnalysis>(F);
10957   TLI = &AM.getResult<TargetLibraryAnalysis>(F);
10958   AC = &AM.getResult<AssumptionAnalysis>(F);
10959   DB = &AM.getResult<DemandedBitsAnalysis>(F);
10960   ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
10961   LAIs = &AM.getResult<LoopAccessAnalysis>(F);
10962 
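        // BlockFrequencyInfo is only needed for profile-guided decisions, so only
        // compute it when a profile summary is available.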
10963   auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
10964   PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
10965   BFI = nullptr;
10966   if (PSI && PSI->hasProfileSummary())
10967     BFI = &AM.getResult<BlockFrequencyAnalysis>(F);
10968   LoopVectorizeResult Result = runImpl(F);
10969   if (!Result.MadeAnyChange)
10970     return PreservedAnalyses::all();
10971   PreservedAnalyses PA;
10972 
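        // With assignment tracking enabled, remove redundant debug records the
        // transformation may have left behind.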
10973   if (isAssignmentTrackingEnabled(*F.getParent())) {
10974     for (auto &BB : F)
10975       RemoveRedundantDbgInstrs(&BB);
10976   }
10977 
10978   PA.preserve<LoopAnalysis>();
10979   PA.preserve<DominatorTreeAnalysis>();
10980   PA.preserve<ScalarEvolutionAnalysis>();
10981   PA.preserve<LoopAccessAnalysis>();
10982 
10983   if (Result.MadeCFGChange) {
10984     // Making CFG changes likely means a loop got vectorized. Indicate that
10985     // extra simplification passes should be run.
10986     // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
10987     // be run if runtime checks have been added.
10988     AM.getResult<ShouldRunExtraVectorPasses>(F);
10989     PA.preserve<ShouldRunExtraVectorPasses>();
10990   } else {
10991     PA.preserveSet<CFGAnalyses>();
10992   }
10993   return PA;
10994 }
10995 
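      // Prints the pass options in pipeline syntax, e.g.
      //   loop-vectorize<no-interleave-forced-only;no-vectorize-forced-only>.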
10996 void LoopVectorizePass::printPipeline(
10997     raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
10998   static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
10999       OS, MapClassName2PassName);
11000 
11001   OS << '<';
11002   OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
11003   OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
11004   OS << '>';
11005 }
11006