xref: /llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (revision 2b55ef187cb6029eed43d7f4c0a3640c32691b31)
1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
17 //
18 // This pass has three parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
28 // There is a development effort going on to migrate loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanAnalysis.h"
61 #include "VPlanHCFGBuilder.h"
62 #include "VPlanPatternMatch.h"
63 #include "VPlanTransforms.h"
64 #include "VPlanUtils.h"
65 #include "VPlanVerifier.h"
66 #include "llvm/ADT/APInt.h"
67 #include "llvm/ADT/ArrayRef.h"
68 #include "llvm/ADT/DenseMap.h"
69 #include "llvm/ADT/DenseMapInfo.h"
70 #include "llvm/ADT/Hashing.h"
71 #include "llvm/ADT/MapVector.h"
72 #include "llvm/ADT/STLExtras.h"
73 #include "llvm/ADT/SmallPtrSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/TypeSwitch.h"
79 #include "llvm/ADT/iterator_range.h"
80 #include "llvm/Analysis/AssumptionCache.h"
81 #include "llvm/Analysis/BasicAliasAnalysis.h"
82 #include "llvm/Analysis/BlockFrequencyInfo.h"
83 #include "llvm/Analysis/CFG.h"
84 #include "llvm/Analysis/CodeMetrics.h"
85 #include "llvm/Analysis/DemandedBits.h"
86 #include "llvm/Analysis/GlobalsModRef.h"
87 #include "llvm/Analysis/LoopAccessAnalysis.h"
88 #include "llvm/Analysis/LoopAnalysisManager.h"
89 #include "llvm/Analysis/LoopInfo.h"
90 #include "llvm/Analysis/LoopIterator.h"
91 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
92 #include "llvm/Analysis/ProfileSummaryInfo.h"
93 #include "llvm/Analysis/ScalarEvolution.h"
94 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
95 #include "llvm/Analysis/TargetLibraryInfo.h"
96 #include "llvm/Analysis/TargetTransformInfo.h"
97 #include "llvm/Analysis/ValueTracking.h"
98 #include "llvm/Analysis/VectorUtils.h"
99 #include "llvm/IR/Attributes.h"
100 #include "llvm/IR/BasicBlock.h"
101 #include "llvm/IR/CFG.h"
102 #include "llvm/IR/Constant.h"
103 #include "llvm/IR/Constants.h"
104 #include "llvm/IR/DataLayout.h"
105 #include "llvm/IR/DebugInfo.h"
106 #include "llvm/IR/DebugLoc.h"
107 #include "llvm/IR/DerivedTypes.h"
108 #include "llvm/IR/DiagnosticInfo.h"
109 #include "llvm/IR/Dominators.h"
110 #include "llvm/IR/Function.h"
111 #include "llvm/IR/IRBuilder.h"
112 #include "llvm/IR/InstrTypes.h"
113 #include "llvm/IR/Instruction.h"
114 #include "llvm/IR/Instructions.h"
115 #include "llvm/IR/IntrinsicInst.h"
116 #include "llvm/IR/Intrinsics.h"
117 #include "llvm/IR/MDBuilder.h"
118 #include "llvm/IR/Metadata.h"
119 #include "llvm/IR/Module.h"
120 #include "llvm/IR/Operator.h"
121 #include "llvm/IR/PatternMatch.h"
122 #include "llvm/IR/ProfDataUtils.h"
123 #include "llvm/IR/Type.h"
124 #include "llvm/IR/Use.h"
125 #include "llvm/IR/User.h"
126 #include "llvm/IR/Value.h"
127 #include "llvm/IR/Verifier.h"
128 #include "llvm/Support/Casting.h"
129 #include "llvm/Support/CommandLine.h"
130 #include "llvm/Support/Debug.h"
131 #include "llvm/Support/ErrorHandling.h"
132 #include "llvm/Support/InstructionCost.h"
133 #include "llvm/Support/MathExtras.h"
134 #include "llvm/Support/NativeFormatting.h"
135 #include "llvm/Support/raw_ostream.h"
136 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
137 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
138 #include "llvm/Transforms/Utils/Local.h"
139 #include "llvm/Transforms/Utils/LoopSimplify.h"
140 #include "llvm/Transforms/Utils/LoopUtils.h"
141 #include "llvm/Transforms/Utils/LoopVersioning.h"
142 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
143 #include "llvm/Transforms/Utils/SizeOpts.h"
144 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
145 #include <algorithm>
146 #include <cassert>
147 #include <cstdint>
148 #include <functional>
149 #include <iterator>
150 #include <limits>
151 #include <memory>
152 #include <string>
153 #include <tuple>
154 #include <utility>
155 
156 using namespace llvm;
157 
158 #define LV_NAME "loop-vectorize"
159 #define DEBUG_TYPE LV_NAME
160 
161 #ifndef NDEBUG
162 const char VerboseDebug[] = DEBUG_TYPE "-verbose";
163 #endif
164 
165 /// @{
166 /// Metadata attribute names
167 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
168 const char LLVMLoopVectorizeFollowupVectorized[] =
169     "llvm.loop.vectorize.followup_vectorized";
170 const char LLVMLoopVectorizeFollowupEpilogue[] =
171     "llvm.loop.vectorize.followup_epilogue";
172 /// @}
173 
174 STATISTIC(LoopsVectorized, "Number of loops vectorized");
175 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
176 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
177 
178 static cl::opt<bool> EnableEpilogueVectorization(
179     "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
180     cl::desc("Enable vectorization of epilogue loops."));
181 
182 static cl::opt<unsigned> EpilogueVectorizationForceVF(
183     "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
184     cl::desc("When epilogue vectorization is enabled, and a value greater than "
185              "1 is specified, forces the given VF for all applicable epilogue "
186              "loops."));
187 
188 static cl::opt<unsigned> EpilogueVectorizationMinVF(
189     "epilogue-vectorization-minimum-VF", cl::Hidden,
190     cl::desc("Only loops with vectorization factor equal to or larger than "
191              "the specified value are considered for epilogue vectorization."));
192 
193 /// Loops with a known constant trip count below this number are vectorized only
194 /// if no scalar iteration overheads are incurred.
195 static cl::opt<unsigned> TinyTripCountVectorThreshold(
196     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
197     cl::desc("Loops with a constant trip count that is smaller than this "
198              "value are vectorized only if no scalar iteration overheads "
199              "are incurred."));
200 
201 static cl::opt<unsigned> VectorizeMemoryCheckThreshold(
202     "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
203     cl::desc("The maximum allowed number of runtime memory checks"));
204 
205 // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
206 // that predication is preferred, and this lists all options. I.e., the
207 // vectorizer will try to fold the tail-loop (epilogue) into the vector body
208 // and predicate the instructions accordingly. If tail-folding fails, there are
209 // different fallback strategies depending on these values:
210 namespace PreferPredicateTy {
211   enum Option {
212     ScalarEpilogue = 0,
213     PredicateElseScalarEpilogue,
214     PredicateOrDontVectorize
215   };
216 } // namespace PreferPredicateTy
217 
218 static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
219     "prefer-predicate-over-epilogue",
220     cl::init(PreferPredicateTy::ScalarEpilogue),
221     cl::Hidden,
222     cl::desc("Tail-folding and predication preferences over creating a scalar "
223              "epilogue loop."),
224     cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
225                          "scalar-epilogue",
226                          "Don't tail-predicate loops, create scalar epilogue"),
227               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
228                          "predicate-else-scalar-epilogue",
229                          "prefer tail-folding, create scalar epilogue if tail "
230                          "folding fails."),
231               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
232                          "predicate-dont-vectorize",
233                          "prefers tail-folding, don't attempt vectorization if "
234                          "tail-folding fails.")));
235 
236 static cl::opt<TailFoldingStyle> ForceTailFoldingStyle(
237     "force-tail-folding-style", cl::desc("Force the tail folding style"),
238     cl::init(TailFoldingStyle::None),
239     cl::values(
240         clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"),
241         clEnumValN(
242             TailFoldingStyle::Data, "data",
243             "Create lane mask for data only, using active.lane.mask intrinsic"),
244         clEnumValN(TailFoldingStyle::DataWithoutLaneMask,
245                    "data-without-lane-mask",
246                    "Create lane mask with compare/stepvector"),
247         clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control",
248                    "Create lane mask using active.lane.mask intrinsic, and use "
249                    "it for both data and control flow"),
250         clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
251                    "data-and-control-without-rt-check",
252                    "Similar to data-and-control, but remove the runtime check"),
253         clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl",
254                    "Use predicated EVL instructions for tail folding. If EVL "
255                    "is unsupported, fallback to data-without-lane-mask.")));
256 
257 static cl::opt<bool> MaximizeBandwidth(
258     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
259     cl::desc("Maximize bandwidth when selecting vectorization factor which "
260              "will be determined by the smallest type in loop."));
261 
262 static cl::opt<bool> EnableInterleavedMemAccesses(
263     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
264     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
265 
266 /// An interleave-group may need masking if it resides in a block that needs
267 /// predication, or in order to mask away gaps.
268 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
269     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
270     cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
271 
272 static cl::opt<unsigned> ForceTargetNumScalarRegs(
273     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
274     cl::desc("A flag that overrides the target's number of scalar registers."));
275 
276 static cl::opt<unsigned> ForceTargetNumVectorRegs(
277     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
278     cl::desc("A flag that overrides the target's number of vector registers."));
279 
280 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
281     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
282     cl::desc("A flag that overrides the target's max interleave factor for "
283              "scalar loops."));
284 
285 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
286     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
287     cl::desc("A flag that overrides the target's max interleave factor for "
288              "vectorized loops."));
289 
290 cl::opt<unsigned> ForceTargetInstructionCost(
291     "force-target-instruction-cost", cl::init(0), cl::Hidden,
292     cl::desc("A flag that overrides the target's expected cost for "
293              "an instruction to a single constant value. Mostly "
294              "useful for getting consistent testing."));
295 
296 static cl::opt<bool> ForceTargetSupportsScalableVectors(
297     "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
298     cl::desc(
299         "Pretend that scalable vectors are supported, even if the target does "
300         "not support them. This flag should only be used for testing."));
301 
302 static cl::opt<unsigned> SmallLoopCost(
303     "small-loop-cost", cl::init(20), cl::Hidden,
304     cl::desc(
305         "The cost of a loop that is considered 'small' by the interleaver."));
306 
307 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
308     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
309     cl::desc("Enable the use of the block frequency analysis to access PGO "
310              "heuristics minimizing code growth in cold regions and being more "
311              "aggressive in hot regions."));
312 
313 // Runtime interleave loops for load/store throughput.
314 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
315     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
316     cl::desc(
317         "Enable runtime interleaving until load/store ports are saturated"));
318 
319 /// The number of stores in a loop that are allowed to need predication.
320 static cl::opt<unsigned> NumberOfStoresToPredicate(
321     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
322     cl::desc("Max number of stores to be predicated behind an if."));
323 
324 static cl::opt<bool> EnableIndVarRegisterHeur(
325     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
326     cl::desc("Count the induction variable only once when interleaving"));
327 
328 static cl::opt<bool> EnableCondStoresVectorization(
329     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
330     cl::desc("Enable if predication of stores during vectorization."));
331 
332 static cl::opt<unsigned> MaxNestedScalarReductionIC(
333     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
334     cl::desc("The maximum interleave count to use when interleaving a scalar "
335              "reduction in a nested loop."));
336 
337 static cl::opt<bool>
338     PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
339                            cl::Hidden,
340                            cl::desc("Prefer in-loop vector reductions, "
341                                     "overriding the targets preference."));
342 
343 static cl::opt<bool> ForceOrderedReductions(
344     "force-ordered-reductions", cl::init(false), cl::Hidden,
345     cl::desc("Enable the vectorisation of loops with in-order (strict) "
346              "FP reductions"));
347 
348 static cl::opt<bool> PreferPredicatedReductionSelect(
349     "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
350     cl::desc(
351         "Prefer predicating a reduction operation over an after loop select."));
352 
353 namespace llvm {
354 cl::opt<bool> EnableVPlanNativePath(
355     "enable-vplan-native-path", cl::Hidden,
356     cl::desc("Enable VPlan-native vectorization path with "
357              "support for outer loop vectorization."));
358 
359 cl::opt<bool>
360     VerifyEachVPlan("vplan-verify-each",
361 #ifdef EXPENSIVE_CHECKS
362                     cl::init(true),
363 #else
364                     cl::init(false),
365 #endif
366                     cl::Hidden,
367                     cl::desc("Verfiy VPlans after VPlan transforms."));
368 } // namespace llvm
369 
370 // This flag enables the stress testing of the VPlan H-CFG construction in the
371 // VPlan-native vectorization path. It must be used in conjuction with
372 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
373 // verification of the H-CFGs built.
374 static cl::opt<bool> VPlanBuildStressTest(
375     "vplan-build-stress-test", cl::init(false), cl::Hidden,
376     cl::desc(
377         "Build VPlan for every supported loop nest in the function and bail "
378         "out right after the build (stress test the VPlan H-CFG construction "
379         "in the VPlan-native vectorization path)."));
380 
381 cl::opt<bool> llvm::EnableLoopInterleaving(
382     "interleave-loops", cl::init(true), cl::Hidden,
383     cl::desc("Enable loop interleaving in Loop vectorization passes"));
384 cl::opt<bool> llvm::EnableLoopVectorization(
385     "vectorize-loops", cl::init(true), cl::Hidden,
386     cl::desc("Run the Loop vectorization passes"));
387 
388 static cl::opt<cl::boolOrDefault> ForceSafeDivisor(
389     "force-widen-divrem-via-safe-divisor", cl::Hidden,
390     cl::desc(
391         "Override cost based safe divisor widening for div/rem instructions"));
392 
393 static cl::opt<bool> UseWiderVFIfCallVariantsPresent(
394     "vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true),
395     cl::Hidden,
396     cl::desc("Try wider VFs if they enable the use of vector variants"));
397 
398 static cl::opt<bool> EnableEarlyExitVectorization(
399     "enable-early-exit-vectorization", cl::init(false), cl::Hidden,
400     cl::desc(
401         "Enable vectorization of early exit loops with uncountable exits."));
402 
403 // Likelyhood of bypassing the vectorized loop because assumptions about SCEV
404 // variables not overflowing do not hold. See `emitSCEVChecks`.
405 static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127};
406 // Likelyhood of bypassing the vectorized loop because pointers overlap. See
407 // `emitMemRuntimeChecks`.
408 static constexpr uint32_t MemCheckBypassWeights[] = {1, 127};
409 // Likelyhood of bypassing the vectorized loop because there are zero trips left
410 // after prolog. See `emitIterationCountCheck`.
411 static constexpr uint32_t MinItersBypassWeights[] = {1, 127};
412 
413 /// A helper function that returns true if the given type is irregular. The
414 /// type is irregular if its allocated size doesn't equal the store size of an
415 /// element of the corresponding vector type.
416 static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
417   // Determine if an array of N elements of type Ty is "bitcast compatible"
418   // with a <N x Ty> vector.
419   // This is only true if there is no padding between the array elements.
420   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
421 }
422 
423 /// Returns "best known" trip count for the specified loop \p L as defined by
424 /// the following procedure:
425 ///   1) Returns exact trip count if it is known.
426 ///   2) Returns expected trip count according to profile data if any.
427 ///   3) Returns upper bound estimate if known, and if \p CanUseConstantMax.
428 ///   4) Returns std::nullopt if all of the above failed.
429 static std::optional<unsigned>
430 getSmallBestKnownTC(PredicatedScalarEvolution &PSE, Loop *L,
431                     bool CanUseConstantMax = true) {
432   // Check if exact trip count is known.
433   if (unsigned ExpectedTC = PSE.getSE()->getSmallConstantTripCount(L))
434     return ExpectedTC;
435 
436   // Check if there is an expected trip count available from profile data.
437   if (LoopVectorizeWithBlockFrequency)
438     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
439       return *EstimatedTC;
440 
441   if (!CanUseConstantMax)
442     return std::nullopt;
443 
444   // Check if upper bound estimate is known.
445   if (unsigned ExpectedTC = PSE.getSmallConstantMaxTripCount())
446     return ExpectedTC;
447 
448   return std::nullopt;
449 }
450 
451 namespace {
452 // Forward declare GeneratedRTChecks.
453 class GeneratedRTChecks;
454 
455 using SCEV2ValueTy = DenseMap<const SCEV *, Value *>;
456 } // namespace
457 
458 namespace llvm {
459 
460 AnalysisKey ShouldRunExtraVectorPasses::Key;
461 
462 /// InnerLoopVectorizer vectorizes loops which contain only one basic
463 /// block to a specified vectorization factor (VF).
464 /// This class performs the widening of scalars into vectors, or multiple
465 /// scalars. This class also implements the following features:
466 /// * It inserts an epilogue loop for handling loops that don't have iteration
467 ///   counts that are known to be a multiple of the vectorization factor.
468 /// * It handles the code generation for reduction variables.
469 /// * Scalarization (implementation using scalars) of un-vectorizable
470 ///   instructions.
471 /// InnerLoopVectorizer does not perform any vectorization-legality
472 /// checks, and relies on the caller to check for the different legality
473 /// aspects. The InnerLoopVectorizer relies on the
474 /// LoopVectorizationLegality class to provide information about the induction
475 /// and reduction variables that were found to a given vectorization factor.
476 class InnerLoopVectorizer {
477 public:
478   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
479                       LoopInfo *LI, DominatorTree *DT,
480                       const TargetLibraryInfo *TLI,
481                       const TargetTransformInfo *TTI, AssumptionCache *AC,
482                       OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
483                       ElementCount MinProfitableTripCount,
484                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
485                       LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
486                       ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks,
487                       VPlan &Plan)
488       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
489         AC(AC), ORE(ORE), VF(VecWidth),
490         MinProfitableTripCount(MinProfitableTripCount), UF(UnrollFactor),
491         Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
492         PSI(PSI), RTChecks(RTChecks), Plan(Plan),
493         VectorPHVPB(Plan.getEntry()->getSingleSuccessor()) {
494     // Query this against the original loop and save it here because the profile
495     // of the original loop header may change as the transformation happens.
496     OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
497         OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
498   }
499 
500   virtual ~InnerLoopVectorizer() = default;
501 
502   /// Create a new empty loop that will contain vectorized instructions later
503   /// on, while the old loop will be used as the scalar remainder. Control flow
504   /// is generated around the vectorized (and scalar epilogue) loops consisting
505   /// of various checks and bypasses. Return the pre-header block of the new
506   /// loop. In the case of epilogue vectorization, this function is overriden to
507   /// handle the more complex control flow around the loops. \p ExpandedSCEVs is
508   /// used to look up SCEV expansions for expressions needed during skeleton
509   /// creation.
510   virtual BasicBlock *
511   createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs);
512 
513   /// Fix the vectorized code, taking care of header phi's, and more.
514   void fixVectorizedLoop(VPTransformState &State);
515 
516   // Return true if any runtime check is added.
517   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
518 
519   /// A helper function to scalarize a single Instruction in the innermost loop.
520   /// Generates a sequence of scalar instances for each lane between \p MinLane
521   /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
522   /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p
523   /// Instr's operands.
524   void scalarizeInstruction(const Instruction *Instr,
525                             VPReplicateRecipe *RepRecipe, const VPLane &Lane,
526                             VPTransformState &State);
527 
528   /// Fix the non-induction PHIs in \p Plan.
529   void fixNonInductionPHIs(VPTransformState &State);
530 
531   /// Returns the original loop trip count.
532   Value *getTripCount() const { return TripCount; }
533 
534   /// Used to set the trip count after ILV's construction and after the
535   /// preheader block has been executed. Note that this always holds the trip
536   /// count of the original loop for both main loop and epilogue vectorization.
537   void setTripCount(Value *TC) { TripCount = TC; }
538 
539   // Retrieve the additional bypass value associated with an original
540   /// induction header phi.
541   Value *getInductionAdditionalBypassValue(PHINode *OrigPhi) const {
542     return Induction2AdditionalBypassValue.at(OrigPhi);
543   }
544 
545   /// Return the additional bypass block which targets the scalar loop by
546   /// skipping the epilogue loop after completing the main loop.
547   BasicBlock *getAdditionalBypassBlock() const {
548     assert(AdditionalBypassBlock &&
549            "Trying to access AdditionalBypassBlock but it has not been set");
550     return AdditionalBypassBlock;
551   }
552 
553 protected:
554   friend class LoopVectorizationPlanner;
555 
556   /// Iteratively sink the scalarized operands of a predicated instruction into
557   /// the block that was created for it.
558   void sinkScalarOperands(Instruction *PredInst);
559 
560   /// Returns (and creates if needed) the trip count of the widened loop.
561   Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);
562 
563   /// Emit a bypass check to see if the vector trip count is zero, including if
564   /// it overflows.
565   void emitIterationCountCheck(BasicBlock *Bypass);
566 
567   /// Emit a bypass check to see if all of the SCEV assumptions we've
568   /// had to make are correct. Returns the block containing the checks or
569   /// nullptr if no checks have been added.
570   BasicBlock *emitSCEVChecks(BasicBlock *Bypass);
571 
572   /// Emit bypass checks to check any memory assumptions we may have made.
573   /// Returns the block containing the checks or nullptr if no checks have been
574   /// added.
575   BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass);
576 
577   /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
578   /// vector loop preheader, middle block and scalar preheader.
579   void createVectorLoopSkeleton(StringRef Prefix);
580 
581   /// Create and record the values for induction variables to resume coming from
582   /// the additional bypass block.
583   void createInductionAdditionalBypassValues(const SCEV2ValueTy &ExpandedSCEVs,
584                                              Value *MainVectorTripCount);
585 
586   /// Allow subclasses to override and print debug traces before/after vplan
587   /// execution, when trace information is requested.
588   virtual void printDebugTracesAtStart() {}
589   virtual void printDebugTracesAtEnd() {}
590 
591   /// Introduces a new VPIRBasicBlock for \p CheckIRBB to Plan between the
592   /// vector preheader and its predecessor, also connecting the new block to the
593   /// scalar preheader.
594   void introduceCheckBlockInVPlan(BasicBlock *CheckIRBB);
595 
596   /// The original loop.
597   Loop *OrigLoop;
598 
599   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
600   /// dynamic knowledge to simplify SCEV expressions and converts them to a
601   /// more usable form.
602   PredicatedScalarEvolution &PSE;
603 
604   /// Loop Info.
605   LoopInfo *LI;
606 
607   /// Dominator Tree.
608   DominatorTree *DT;
609 
610   /// Target Library Info.
611   const TargetLibraryInfo *TLI;
612 
613   /// Target Transform Info.
614   const TargetTransformInfo *TTI;
615 
616   /// Assumption Cache.
617   AssumptionCache *AC;
618 
619   /// Interface to emit optimization remarks.
620   OptimizationRemarkEmitter *ORE;
621 
622   /// The vectorization SIMD factor to use. Each vector will have this many
623   /// vector elements.
624   ElementCount VF;
625 
626   ElementCount MinProfitableTripCount;
627 
628   /// The vectorization unroll factor to use. Each scalar is vectorized to this
629   /// many different vector instructions.
630   unsigned UF;
631 
632   /// The builder that we use
633   IRBuilder<> Builder;
634 
635   // --- Vectorization state ---
636 
637   /// The vector-loop preheader.
638   BasicBlock *LoopVectorPreHeader;
639 
640   /// The scalar-loop preheader.
641   BasicBlock *LoopScalarPreHeader;
642 
643   /// Middle Block between the vector and the scalar.
644   BasicBlock *LoopMiddleBlock;
645 
646   /// A list of all bypass blocks. The first block is the entry of the loop.
647   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
648 
649   /// Store instructions that were predicated.
650   SmallVector<Instruction *, 4> PredicatedInstructions;
651 
652   /// Trip count of the original loop.
653   Value *TripCount = nullptr;
654 
655   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
656   Value *VectorTripCount = nullptr;
657 
658   /// The legality analysis.
659   LoopVectorizationLegality *Legal;
660 
661   /// The profitablity analysis.
662   LoopVectorizationCostModel *Cost;
663 
664   // Record whether runtime checks are added.
665   bool AddedSafetyChecks = false;
666 
667   /// BFI and PSI are used to check for profile guided size optimizations.
668   BlockFrequencyInfo *BFI;
669   ProfileSummaryInfo *PSI;
670 
671   // Whether this loop should be optimized for size based on profile guided size
672   // optimizatios.
673   bool OptForSizeBasedOnProfile;
674 
675   /// Structure to hold information about generated runtime checks, responsible
676   /// for cleaning the checks, if vectorization turns out unprofitable.
677   GeneratedRTChecks &RTChecks;
678 
679   /// Mapping of induction phis to their additional bypass values. They
680   /// need to be added as operands to phi nodes in the scalar loop preheader
681   /// after the epilogue skeleton has been created.
682   DenseMap<PHINode *, Value *> Induction2AdditionalBypassValue;
683 
684   /// The additional bypass block which conditionally skips over the epilogue
685   /// loop after executing the main loop. Needed to resume inductions and
686   /// reductions during epilogue vectorization.
687   BasicBlock *AdditionalBypassBlock = nullptr;
688 
689   VPlan &Plan;
690 
691   /// The vector preheader block of \p Plan, used as target for check blocks
692   /// introduced during skeleton creation.
693   VPBlockBase *VectorPHVPB;
694 };
695 
696 /// Encapsulate information regarding vectorization of a loop and its epilogue.
697 /// This information is meant to be updated and used across two stages of
698 /// epilogue vectorization.
699 struct EpilogueLoopVectorizationInfo {
700   ElementCount MainLoopVF = ElementCount::getFixed(0);
701   unsigned MainLoopUF = 0;
702   ElementCount EpilogueVF = ElementCount::getFixed(0);
703   unsigned EpilogueUF = 0;
704   BasicBlock *MainLoopIterationCountCheck = nullptr;
705   BasicBlock *EpilogueIterationCountCheck = nullptr;
706   BasicBlock *SCEVSafetyCheck = nullptr;
707   BasicBlock *MemSafetyCheck = nullptr;
708   Value *TripCount = nullptr;
709   Value *VectorTripCount = nullptr;
710   VPlan &EpiloguePlan;
711 
712   EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
713                                 ElementCount EVF, unsigned EUF,
714                                 VPlan &EpiloguePlan)
715       : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF),
716         EpiloguePlan(EpiloguePlan) {
717     assert(EUF == 1 &&
718            "A high UF for the epilogue loop is likely not beneficial.");
719   }
720 };
721 
722 /// An extension of the inner loop vectorizer that creates a skeleton for a
723 /// vectorized loop that has its epilogue (residual) also vectorized.
724 /// The idea is to run the vplan on a given loop twice, firstly to setup the
725 /// skeleton and vectorize the main loop, and secondly to complete the skeleton
726 /// from the first step and vectorize the epilogue.  This is achieved by
727 /// deriving two concrete strategy classes from this base class and invoking
728 /// them in succession from the loop vectorizer planner.
729 class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
730 public:
731   InnerLoopAndEpilogueVectorizer(
732       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
733       DominatorTree *DT, const TargetLibraryInfo *TLI,
734       const TargetTransformInfo *TTI, AssumptionCache *AC,
735       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
736       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
737       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
738       GeneratedRTChecks &Checks, VPlan &Plan)
739       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
740                             EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
741                             CM, BFI, PSI, Checks, Plan),
742         EPI(EPI) {}
743 
744   // Override this function to handle the more complex control flow around the
745   // three loops.
746   BasicBlock *
747   createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final {
748     return createEpilogueVectorizedLoopSkeleton(ExpandedSCEVs);
749   }
750 
751   /// The interface for creating a vectorized skeleton using one of two
752   /// different strategies, each corresponding to one execution of the vplan
753   /// as described above.
754   virtual BasicBlock *
755   createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) = 0;
756 
757   /// Holds and updates state information required to vectorize the main loop
758   /// and its epilogue in two separate passes. This setup helps us avoid
759   /// regenerating and recomputing runtime safety checks. It also helps us to
760   /// shorten the iteration-count-check path length for the cases where the
761   /// iteration count of the loop is so small that the main vector loop is
762   /// completely skipped.
763   EpilogueLoopVectorizationInfo &EPI;
764 };
765 
766 /// A specialized derived class of inner loop vectorizer that performs
767 /// vectorization of *main* loops in the process of vectorizing loops and their
768 /// epilogues.
769 class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
770 public:
771   EpilogueVectorizerMainLoop(
772       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
773       DominatorTree *DT, const TargetLibraryInfo *TLI,
774       const TargetTransformInfo *TTI, AssumptionCache *AC,
775       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
776       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
777       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
778       GeneratedRTChecks &Check, VPlan &Plan)
779       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
780                                        EPI, LVL, CM, BFI, PSI, Check, Plan) {}
781   /// Implements the interface for creating a vectorized skeleton using the
782   /// *main loop* strategy (ie the first pass of vplan execution).
783   BasicBlock *
784   createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
785 
786 protected:
787   /// Emits an iteration count bypass check once for the main loop (when \p
788   /// ForEpilogue is false) and once for the epilogue loop (when \p
789   /// ForEpilogue is true).
790   BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
791   void printDebugTracesAtStart() override;
792   void printDebugTracesAtEnd() override;
793 };
794 
795 // A specialized derived class of inner loop vectorizer that performs
796 // vectorization of *epilogue* loops in the process of vectorizing loops and
797 // their epilogues.
798 class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
799 public:
800   EpilogueVectorizerEpilogueLoop(
801       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
802       DominatorTree *DT, const TargetLibraryInfo *TLI,
803       const TargetTransformInfo *TTI, AssumptionCache *AC,
804       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
805       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
806       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
807       GeneratedRTChecks &Checks, VPlan &Plan)
808       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
809                                        EPI, LVL, CM, BFI, PSI, Checks, Plan) {
810     TripCount = EPI.TripCount;
811   }
812   /// Implements the interface for creating a vectorized skeleton using the
813   /// *epilogue loop* strategy (ie the second pass of vplan execution).
814   BasicBlock *
815   createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
816 
817 protected:
818   /// Emits an iteration count bypass check after the main vector loop has
819   /// finished to see if there are any iterations left to execute by either
820   /// the vector epilogue or the scalar epilogue.
821   BasicBlock *emitMinimumVectorEpilogueIterCountCheck(
822                                                       BasicBlock *Bypass,
823                                                       BasicBlock *Insert);
824   void printDebugTracesAtStart() override;
825   void printDebugTracesAtEnd() override;
826 };
827 } // end namespace llvm
828 
829 /// Look for a meaningful debug location on the instruction or its operands.
830 static DebugLoc getDebugLocFromInstOrOperands(Instruction *I) {
831   if (!I)
832     return DebugLoc();
833 
834   DebugLoc Empty;
835   if (I->getDebugLoc() != Empty)
836     return I->getDebugLoc();
837 
838   for (Use &Op : I->operands()) {
839     if (Instruction *OpInst = dyn_cast<Instruction>(Op))
840       if (OpInst->getDebugLoc() != Empty)
841         return OpInst->getDebugLoc();
842   }
843 
844   return I->getDebugLoc();
845 }
846 
847 /// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
848 /// is passed, the message relates to that particular instruction.
849 #ifndef NDEBUG
850 static void debugVectorizationMessage(const StringRef Prefix,
851                                       const StringRef DebugMsg,
852                                       Instruction *I) {
853   dbgs() << "LV: " << Prefix << DebugMsg;
854   if (I != nullptr)
855     dbgs() << " " << *I;
856   else
857     dbgs() << '.';
858   dbgs() << '\n';
859 }
860 #endif
861 
862 /// Create an analysis remark that explains why vectorization failed
863 ///
864 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
865 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
866 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
867 /// the location of the remark. If \p DL is passed, use it as debug location for
868 /// the remark. \return the remark object that can be streamed to.
869 static OptimizationRemarkAnalysis
870 createLVAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop,
871                  Instruction *I, DebugLoc DL = {}) {
872   Value *CodeRegion = I ? I->getParent() : TheLoop->getHeader();
873   // If debug location is attached to the instruction, use it. Otherwise if DL
874   // was not provided, use the loop's.
875   if (I && I->getDebugLoc())
876     DL = I->getDebugLoc();
877   else if (!DL)
878     DL = TheLoop->getStartLoc();
879 
880   return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
881 }
882 
883 namespace llvm {
884 
885 /// Return a value for Step multiplied by VF.
886 Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
887                        int64_t Step) {
888   assert(Ty->isIntegerTy() && "Expected an integer step");
889   return B.CreateElementCount(Ty, VF.multiplyCoefficientBy(Step));
890 }
891 
892 /// Return the runtime value for VF.
893 Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
894   return B.CreateElementCount(Ty, VF);
895 }
896 
897 void reportVectorizationFailure(const StringRef DebugMsg,
898                                 const StringRef OREMsg, const StringRef ORETag,
899                                 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
900                                 Instruction *I) {
901   LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
902   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
903   ORE->emit(
904       createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
905       << "loop not vectorized: " << OREMsg);
906 }
907 
908 /// Reports an informative message: print \p Msg for debugging purposes as well
909 /// as an optimization remark. Uses either \p I as location of the remark, or
910 /// otherwise \p TheLoop. If \p DL is passed, use it as debug location for the
911 /// remark. If \p DL is passed, use it as debug location for the remark.
912 static void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
913                                     OptimizationRemarkEmitter *ORE,
914                                     Loop *TheLoop, Instruction *I = nullptr,
915                                     DebugLoc DL = {}) {
916   LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
917   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
918   ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop,
919                              I, DL)
920             << Msg);
921 }
922 
923 /// Report successful vectorization of the loop. In case an outer loop is
924 /// vectorized, prepend "outer" to the vectorization remark.
925 static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop,
926                                 VectorizationFactor VF, unsigned IC) {
927   LLVM_DEBUG(debugVectorizationMessage(
928       "Vectorizing: ", TheLoop->isInnermost() ? "innermost loop" : "outer loop",
929       nullptr));
930   StringRef LoopType = TheLoop->isInnermost() ? "" : "outer ";
931   ORE->emit([&]() {
932     return OptimizationRemark(LV_NAME, "Vectorized", TheLoop->getStartLoc(),
933                               TheLoop->getHeader())
934            << "vectorized " << LoopType << "loop (vectorization width: "
935            << ore::NV("VectorizationFactor", VF.Width)
936            << ", interleaved count: " << ore::NV("InterleaveCount", IC) << ")";
937   });
938 }
939 
940 } // end namespace llvm
941 
942 namespace llvm {
943 
944 // Loop vectorization cost-model hints how the scalar epilogue loop should be
945 // lowered.
946 enum ScalarEpilogueLowering {
947 
948   // The default: allowing scalar epilogues.
949   CM_ScalarEpilogueAllowed,
950 
951   // Vectorization with OptForSize: don't allow epilogues.
952   CM_ScalarEpilogueNotAllowedOptSize,
953 
954   // A special case of vectorisation with OptForSize: loops with a very small
955   // trip count are considered for vectorization under OptForSize, thereby
956   // making sure the cost of their loop body is dominant, free of runtime
957   // guards and scalar iteration overheads.
958   CM_ScalarEpilogueNotAllowedLowTripLoop,
959 
960   // Loop hint predicate indicating an epilogue is undesired.
961   CM_ScalarEpilogueNotNeededUsePredicate,
962 
963   // Directive indicating we must either tail fold or not vectorize
964   CM_ScalarEpilogueNotAllowedUsePredicate
965 };
966 
967 using InstructionVFPair = std::pair<Instruction *, ElementCount>;
968 
969 /// LoopVectorizationCostModel - estimates the expected speedups due to
970 /// vectorization.
971 /// In many cases vectorization is not profitable. This can happen because of
972 /// a number of reasons. In this class we mainly attempt to predict the
973 /// expected speedup/slowdowns due to the supported instruction set. We use the
974 /// TargetTransformInfo to query the different backends for the cost of
975 /// different operations.
976 class LoopVectorizationCostModel {
977   friend class LoopVectorizationPlanner;
978 
979 public:
980   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
981                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
982                              LoopVectorizationLegality *Legal,
983                              const TargetTransformInfo &TTI,
984                              const TargetLibraryInfo *TLI, DemandedBits *DB,
985                              AssumptionCache *AC,
986                              OptimizationRemarkEmitter *ORE, const Function *F,
987                              const LoopVectorizeHints *Hints,
988                              InterleavedAccessInfo &IAI)
989       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
990         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
991         Hints(Hints), InterleaveInfo(IAI), CostKind(TTI::TCK_RecipThroughput) {}
992 
993   /// \return An upper bound for the vectorization factors (both fixed and
994   /// scalable). If the factors are 0, vectorization and interleaving should be
995   /// avoided up front.
996   FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
997 
998   /// \return True if runtime checks are required for vectorization, and false
999   /// otherwise.
1000   bool runtimeChecksRequired();
1001 
1002   /// Setup cost-based decisions for user vectorization factor.
1003   /// \return true if the UserVF is a feasible VF to be chosen.
1004   bool selectUserVectorizationFactor(ElementCount UserVF) {
1005     collectUniformsAndScalars(UserVF);
1006     collectInstsToScalarize(UserVF);
1007     return expectedCost(UserVF).isValid();
1008   }
1009 
1010   /// \return The size (in bits) of the smallest and widest types in the code
1011   /// that needs to be vectorized. We ignore values that remain scalar such as
1012   /// 64 bit loop indices.
1013   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1014 
1015   /// \return The desired interleave count.
1016   /// If interleave count has been specified by metadata it will be returned.
1017   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1018   /// are the selected vectorization factor and the cost of the selected VF.
1019   unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost);
1020 
1021   /// Memory access instruction may be vectorized in more than one way.
1022   /// Form of instruction after vectorization depends on cost.
1023   /// This function takes cost-based decisions for Load/Store instructions
1024   /// and collects them in a map. This decisions map is used for building
1025   /// the lists of loop-uniform and loop-scalar instructions.
1026   /// The calculated cost is saved with widening decision in order to
1027   /// avoid redundant calculations.
1028   void setCostBasedWideningDecision(ElementCount VF);
1029 
1030   /// A call may be vectorized in different ways depending on whether we have
1031   /// vectorized variants available and whether the target supports masking.
1032   /// This function analyzes all calls in the function at the supplied VF,
1033   /// makes a decision based on the costs of available options, and stores that
1034   /// decision in a map for use in planning and plan execution.
1035   void setVectorizedCallDecision(ElementCount VF);
1036 
1037   /// A struct that represents some properties of the register usage
1038   /// of a loop.
1039   struct RegisterUsage {
1040     /// Holds the number of loop invariant values that are used in the loop.
1041     /// The key is ClassID of target-provided register class.
1042     SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1043     /// Holds the maximum number of concurrent live intervals in the loop.
1044     /// The key is ClassID of target-provided register class.
1045     SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1046   };
1047 
1048   /// \return Returns information about the register usages of the loop for the
1049   /// given vectorization factors.
1050   SmallVector<RegisterUsage, 8>
1051   calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1052 
1053   /// Collect values we want to ignore in the cost model.
1054   void collectValuesToIgnore();
1055 
1056   /// Collect all element types in the loop for which widening is needed.
1057   void collectElementTypesForWidening();
1058 
1059   /// Split reductions into those that happen in the loop, and those that happen
1060   /// outside. In loop reductions are collected into InLoopReductions.
1061   void collectInLoopReductions();
1062 
1063   /// Returns true if we should use strict in-order reductions for the given
1064   /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
1065   /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
1066   /// of FP operations.
1067   bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
1068     return !Hints->allowReordering() && RdxDesc.isOrdered();
1069   }
1070 
1071   /// \returns The smallest bitwidth each instruction can be represented with.
1072   /// The vector equivalents of these instructions should be truncated to this
1073   /// type.
1074   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1075     return MinBWs;
1076   }
1077 
1078   /// \returns True if it is more profitable to scalarize instruction \p I for
1079   /// vectorization factor \p VF.
1080   bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1081     assert(VF.isVector() &&
1082            "Profitable to scalarize relevant only for VF > 1.");
1083     assert(
1084         TheLoop->isInnermost() &&
1085         "cost-model should not be used for outer loops (in VPlan-native path)");
1086 
1087     auto Scalars = InstsToScalarize.find(VF);
1088     assert(Scalars != InstsToScalarize.end() &&
1089            "VF not yet analyzed for scalarization profitability");
1090     return Scalars->second.contains(I);
1091   }
1092 
1093   /// Returns true if \p I is known to be uniform after vectorization.
1094   bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1095     assert(
1096         TheLoop->isInnermost() &&
1097         "cost-model should not be used for outer loops (in VPlan-native path)");
1098     // Pseudo probe needs to be duplicated for each unrolled iteration and
1099     // vector lane so that profiled loop trip count can be accurately
1100     // accumulated instead of being under counted.
1101     if (isa<PseudoProbeInst>(I))
1102       return false;
1103 
1104     if (VF.isScalar())
1105       return true;
1106 
1107     auto UniformsPerVF = Uniforms.find(VF);
1108     assert(UniformsPerVF != Uniforms.end() &&
1109            "VF not yet analyzed for uniformity");
1110     return UniformsPerVF->second.count(I);
1111   }
1112 
1113   /// Returns true if \p I is known to be scalar after vectorization.
1114   bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1115     assert(
1116         TheLoop->isInnermost() &&
1117         "cost-model should not be used for outer loops (in VPlan-native path)");
1118     if (VF.isScalar())
1119       return true;
1120 
1121     auto ScalarsPerVF = Scalars.find(VF);
1122     assert(ScalarsPerVF != Scalars.end() &&
1123            "Scalar values are not calculated for VF");
1124     return ScalarsPerVF->second.count(I);
1125   }
1126 
1127   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1128   /// for vectorization factor \p VF.
1129   bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1130     return VF.isVector() && MinBWs.contains(I) &&
1131            !isProfitableToScalarize(I, VF) &&
1132            !isScalarAfterVectorization(I, VF);
1133   }
1134 
1135   /// Decision that was taken during cost calculation for memory instruction.
1136   enum InstWidening {
1137     CM_Unknown,
1138     CM_Widen,         // For consecutive accesses with stride +1.
1139     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1140     CM_Interleave,
1141     CM_GatherScatter,
1142     CM_Scalarize,
1143     CM_VectorCall,
1144     CM_IntrinsicCall
1145   };
1146 
1147   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1148   /// instruction \p I and vector width \p VF.
1149   void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1150                            InstructionCost Cost) {
1151     assert(VF.isVector() && "Expected VF >=2");
1152     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1153   }
1154 
1155   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1156   /// interleaving group \p Grp and vector width \p VF.
1157   void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1158                            ElementCount VF, InstWidening W,
1159                            InstructionCost Cost) {
1160     assert(VF.isVector() && "Expected VF >=2");
1161     /// Broadcast this decicion to all instructions inside the group.
1162     /// When interleaving, the cost will only be assigned one instruction, the
1163     /// insert position. For other cases, add the appropriate fraction of the
1164     /// total cost to each instruction. This ensures accurate costs are used,
1165     /// even if the insert position instruction is not used.
1166     InstructionCost InsertPosCost = Cost;
1167     InstructionCost OtherMemberCost = 0;
1168     if (W != CM_Interleave)
1169       OtherMemberCost = InsertPosCost = Cost / Grp->getNumMembers();
1170     ;
1171     for (unsigned Idx = 0; Idx < Grp->getFactor(); ++Idx) {
1172       if (auto *I = Grp->getMember(Idx)) {
1173         if (Grp->getInsertPos() == I)
1174           WideningDecisions[std::make_pair(I, VF)] =
1175               std::make_pair(W, InsertPosCost);
1176         else
1177           WideningDecisions[std::make_pair(I, VF)] =
1178               std::make_pair(W, OtherMemberCost);
1179       }
1180     }
1181   }
1182 
1183   /// Return the cost model decision for the given instruction \p I and vector
1184   /// width \p VF. Return CM_Unknown if this instruction did not pass
1185   /// through the cost modeling.
1186   InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1187     assert(VF.isVector() && "Expected VF to be a vector VF");
1188     assert(
1189         TheLoop->isInnermost() &&
1190         "cost-model should not be used for outer loops (in VPlan-native path)");
1191 
1192     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1193     auto Itr = WideningDecisions.find(InstOnVF);
1194     if (Itr == WideningDecisions.end())
1195       return CM_Unknown;
1196     return Itr->second.first;
1197   }
1198 
1199   /// Return the vectorization cost for the given instruction \p I and vector
1200   /// width \p VF.
1201   InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1202     assert(VF.isVector() && "Expected VF >=2");
1203     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1204     assert(WideningDecisions.contains(InstOnVF) &&
1205            "The cost is not calculated");
1206     return WideningDecisions[InstOnVF].second;
1207   }
1208 
1209   struct CallWideningDecision {
1210     InstWidening Kind;
1211     Function *Variant;
1212     Intrinsic::ID IID;
1213     std::optional<unsigned> MaskPos;
1214     InstructionCost Cost;
1215   };
1216 
1217   void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind,
1218                                Function *Variant, Intrinsic::ID IID,
1219                                std::optional<unsigned> MaskPos,
1220                                InstructionCost Cost) {
1221     assert(!VF.isScalar() && "Expected vector VF");
1222     CallWideningDecisions[std::make_pair(CI, VF)] = {Kind, Variant, IID,
1223                                                      MaskPos, Cost};
1224   }
1225 
1226   CallWideningDecision getCallWideningDecision(CallInst *CI,
1227                                                ElementCount VF) const {
1228     assert(!VF.isScalar() && "Expected vector VF");
1229     return CallWideningDecisions.at(std::make_pair(CI, VF));
1230   }
1231 
1232   /// Return True if instruction \p I is an optimizable truncate whose operand
1233   /// is an induction variable. Such a truncate will be removed by adding a new
1234   /// induction variable with the destination type.
1235   bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1236     // If the instruction is not a truncate, return false.
1237     auto *Trunc = dyn_cast<TruncInst>(I);
1238     if (!Trunc)
1239       return false;
1240 
1241     // Get the source and destination types of the truncate.
1242     Type *SrcTy = toVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1243     Type *DestTy = toVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1244 
1245     // If the truncate is free for the given types, return false. Replacing a
1246     // free truncate with an induction variable would add an induction variable
1247     // update instruction to each iteration of the loop. We exclude from this
1248     // check the primary induction variable since it will need an update
1249     // instruction regardless.
1250     Value *Op = Trunc->getOperand(0);
1251     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1252       return false;
1253 
1254     // If the truncated value is not an induction variable, return false.
1255     return Legal->isInductionPhi(Op);
1256   }
1257 
1258   /// Collects the instructions to scalarize for each predicated instruction in
1259   /// the loop.
1260   void collectInstsToScalarize(ElementCount VF);
1261 
1262   /// Collect Uniform and Scalar values for the given \p VF.
1263   /// The sets depend on CM decision for Load/Store instructions
1264   /// that may be vectorized as interleave, gather-scatter or scalarized.
1265   /// Also make a decision on what to do about call instructions in the loop
1266   /// at that VF -- scalarize, call a known vector routine, or call a
1267   /// vector intrinsic.
1268   void collectUniformsAndScalars(ElementCount VF) {
1269     // Do the analysis once.
1270     if (VF.isScalar() || Uniforms.contains(VF))
1271       return;
1272     setCostBasedWideningDecision(VF);
1273     collectLoopUniforms(VF);
1274     setVectorizedCallDecision(VF);
1275     collectLoopScalars(VF);
1276   }
1277 
1278   /// Returns true if the target machine supports masked store operation
1279   /// for the given \p DataType and kind of access to \p Ptr.
1280   bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1281     return Legal->isConsecutivePtr(DataType, Ptr) &&
1282            TTI.isLegalMaskedStore(DataType, Alignment);
1283   }
1284 
1285   /// Returns true if the target machine supports masked load operation
1286   /// for the given \p DataType and kind of access to \p Ptr.
1287   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1288     return Legal->isConsecutivePtr(DataType, Ptr) &&
1289            TTI.isLegalMaskedLoad(DataType, Alignment);
1290   }
1291 
1292   /// Returns true if the target machine can represent \p V as a masked gather
1293   /// or scatter operation.
1294   bool isLegalGatherOrScatter(Value *V, ElementCount VF) {
1295     bool LI = isa<LoadInst>(V);
1296     bool SI = isa<StoreInst>(V);
1297     if (!LI && !SI)
1298       return false;
1299     auto *Ty = getLoadStoreType(V);
1300     Align Align = getLoadStoreAlignment(V);
1301     if (VF.isVector())
1302       Ty = VectorType::get(Ty, VF);
1303     return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1304            (SI && TTI.isLegalMaskedScatter(Ty, Align));
1305   }
1306 
1307   /// Returns true if the target machine supports all of the reduction
1308   /// variables found for the given VF.
1309   bool canVectorizeReductions(ElementCount VF) const {
1310     return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1311       const RecurrenceDescriptor &RdxDesc = Reduction.second;
1312       return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1313     }));
1314   }
1315 
1316   /// Given costs for both strategies, return true if the scalar predication
1317   /// lowering should be used for div/rem.  This incorporates an override
1318   /// option so it is not simply a cost comparison.
1319   bool isDivRemScalarWithPredication(InstructionCost ScalarCost,
1320                                      InstructionCost SafeDivisorCost) const {
1321     switch (ForceSafeDivisor) {
1322     case cl::BOU_UNSET:
1323       return ScalarCost < SafeDivisorCost;
1324     case cl::BOU_TRUE:
1325       return false;
1326     case cl::BOU_FALSE:
1327       return true;
1328     }
1329     llvm_unreachable("impossible case value");
1330   }
1331 
1332   /// Returns true if \p I is an instruction which requires predication and
1333   /// for which our chosen predication strategy is scalarization (i.e. we
1334   /// don't have an alternate strategy such as masking available).
1335   /// \p VF is the vectorization factor that will be used to vectorize \p I.
1336   bool isScalarWithPredication(Instruction *I, ElementCount VF) const;
1337 
1338   /// Returns true if \p I is an instruction that needs to be predicated
1339   /// at runtime.  The result is independent of the predication mechanism.
1340   /// Superset of instructions that return true for isScalarWithPredication.
1341   bool isPredicatedInst(Instruction *I) const;
1342 
1343   /// Return the costs for our two available strategies for lowering a
1344   /// div/rem operation which requires speculating at least one lane.
1345   /// First result is for scalarization (will be invalid for scalable
1346   /// vectors); second is for the safe-divisor strategy.
1347   std::pair<InstructionCost, InstructionCost>
1348   getDivRemSpeculationCost(Instruction *I,
1349                            ElementCount VF) const;
1350 
1351   /// Returns true if \p I is a memory instruction with consecutive memory
1352   /// access that can be widened.
1353   bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF);
1354 
1355   /// Returns true if \p I is a memory instruction in an interleaved-group
1356   /// of memory accesses that can be vectorized with wide vector loads/stores
1357   /// and shuffles.
1358   bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF) const;
1359 
1360   /// Check if \p Instr belongs to any interleaved access group.
1361   bool isAccessInterleaved(Instruction *Instr) const {
1362     return InterleaveInfo.isInterleaved(Instr);
1363   }
1364 
1365   /// Get the interleaved access group that \p Instr belongs to.
1366   const InterleaveGroup<Instruction> *
1367   getInterleavedAccessGroup(Instruction *Instr) const {
1368     return InterleaveInfo.getInterleaveGroup(Instr);
1369   }
1370 
1371   /// Returns true if we're required to use a scalar epilogue for at least
1372   /// the final iteration of the original loop.
1373   bool requiresScalarEpilogue(bool IsVectorizing) const {
1374     if (!isScalarEpilogueAllowed()) {
1375       LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
1376       return false;
1377     }
1378     // If we might exit from anywhere but the latch and early exit vectorization
1379     // is disabled, we must run the exiting iteration in scalar form.
1380     if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch() &&
1381         !(EnableEarlyExitVectorization && Legal->hasUncountableEarlyExit())) {
1382       LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: not exiting "
1383                            "from latch block\n");
1384       return true;
1385     }
1386     if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) {
1387       LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: "
1388                            "interleaved group requires scalar epilogue\n");
1389       return true;
1390     }
1391     LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
1392     return false;
1393   }
1394 
1395   /// Returns true if we're required to use a scalar epilogue for at least
1396   /// the final iteration of the original loop for all VFs in \p Range.
1397   /// A scalar epilogue must either be required for all VFs in \p Range or for
1398   /// none.
1399   bool requiresScalarEpilogue(VFRange Range) const {
1400     auto RequiresScalarEpilogue = [this](ElementCount VF) {
1401       return requiresScalarEpilogue(VF.isVector());
1402     };
1403     bool IsRequired = all_of(Range, RequiresScalarEpilogue);
1404     assert(
1405         (IsRequired || none_of(Range, RequiresScalarEpilogue)) &&
1406         "all VFs in range must agree on whether a scalar epilogue is required");
1407     return IsRequired;
1408   }
1409 
1410   /// Returns true if a scalar epilogue is not allowed due to optsize or a
1411   /// loop hint annotation.
1412   bool isScalarEpilogueAllowed() const {
1413     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1414   }
1415 
1416   /// Returns the TailFoldingStyle that is best for the current loop.
1417   TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
1418     if (!ChosenTailFoldingStyle)
1419       return TailFoldingStyle::None;
1420     return IVUpdateMayOverflow ? ChosenTailFoldingStyle->first
1421                                : ChosenTailFoldingStyle->second;
1422   }
1423 
1424   /// Selects and saves TailFoldingStyle for 2 options - if IV update may
1425   /// overflow or not.
1426   /// \param IsScalableVF true if scalable vector factors enabled.
1427   /// \param UserIC User specific interleave count.
1428   void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC) {
1429     assert(!ChosenTailFoldingStyle && "Tail folding must not be selected yet.");
1430     if (!Legal->canFoldTailByMasking()) {
1431       ChosenTailFoldingStyle =
1432           std::make_pair(TailFoldingStyle::None, TailFoldingStyle::None);
1433       return;
1434     }
1435 
1436     if (!ForceTailFoldingStyle.getNumOccurrences()) {
1437       ChosenTailFoldingStyle = std::make_pair(
1438           TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/true),
1439           TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false));
1440       return;
1441     }
1442 
1443     // Set styles when forced.
1444     ChosenTailFoldingStyle = std::make_pair(ForceTailFoldingStyle.getValue(),
1445                                             ForceTailFoldingStyle.getValue());
1446     if (ForceTailFoldingStyle != TailFoldingStyle::DataWithEVL)
1447       return;
1448     // Override forced styles if needed.
1449     // FIXME: use actual opcode/data type for analysis here.
1450     // FIXME: Investigate opportunity for fixed vector factor.
1451     // FIXME: support fixed-order recurrences by fixing splice of non VFxUF
1452     // penultimate EVL.
1453     bool EVLIsLegal =
1454         UserIC <= 1 && TTI.hasActiveVectorLength(0, nullptr, Align()) &&
1455         !EnableVPlanNativePath && Legal->getFixedOrderRecurrences().empty();
1456     if (!EVLIsLegal) {
1457       // If for some reason EVL mode is unsupported, fallback to
1458       // DataWithoutLaneMask to try to vectorize the loop with folded tail
1459       // in a generic way.
1460       ChosenTailFoldingStyle =
1461           std::make_pair(TailFoldingStyle::DataWithoutLaneMask,
1462                          TailFoldingStyle::DataWithoutLaneMask);
1463       LLVM_DEBUG(
1464           dbgs()
1465           << "LV: Preference for VP intrinsics indicated. Will "
1466              "not try to generate VP Intrinsics "
1467           << (UserIC > 1
1468                   ? "since interleave count specified is greater than 1.\n"
1469                   : "due to non-interleaving reasons.\n"));
1470     }
1471   }
1472 
1473   /// Returns true if all loop blocks should be masked to fold tail loop.
1474   bool foldTailByMasking() const {
1475     // TODO: check if it is possible to check for None style independent of
1476     // IVUpdateMayOverflow flag in getTailFoldingStyle.
1477     return getTailFoldingStyle() != TailFoldingStyle::None;
1478   }
1479 
1480   /// Return maximum safe number of elements to be processed per vector
1481   /// iteration, which do not prevent store-load forwarding and are safe with
1482   /// regard to the memory dependencies. Required for EVL-based VPlans to
1483   /// correctly calculate AVL (application vector length) as min(remaining AVL,
1484   /// MaxSafeElements).
1485   /// TODO: need to consider adjusting cost model to use this value as a
1486   /// vectorization factor for EVL-based vectorization.
1487   std::optional<unsigned> getMaxSafeElements() const { return MaxSafeElements; }
1488 
1489   /// Returns true if the instructions in this block requires predication
1490   /// for any reason, e.g. because tail folding now requires a predicate
1491   /// or because the block in the original loop was predicated.
1492   bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
1493     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1494   }
1495 
1496   /// Returns true if VP intrinsics with explicit vector length support should
1497   /// be generated in the tail folded loop.
1498   bool foldTailWithEVL() const {
1499     return getTailFoldingStyle() == TailFoldingStyle::DataWithEVL;
1500   }
1501 
1502   /// Returns true if the Phi is part of an inloop reduction.
1503   bool isInLoopReduction(PHINode *Phi) const {
1504     return InLoopReductions.contains(Phi);
1505   }
1506 
1507   /// Returns true if the predicated reduction select should be used to set the
1508   /// incoming value for the reduction phi.
1509   bool usePredicatedReductionSelect(unsigned Opcode, Type *PhiTy) const {
1510     // Force to use predicated reduction select since the EVL of the
1511     // second-to-last iteration might not be VF*UF.
1512     if (foldTailWithEVL())
1513       return true;
1514     return PreferPredicatedReductionSelect ||
1515            TTI.preferPredicatedReductionSelect(
1516                Opcode, PhiTy, TargetTransformInfo::ReductionFlags());
1517   }
1518 
1519   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1520   /// with factor VF.  Return the cost of the instruction, including
1521   /// scalarization overhead if it's needed.
1522   InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1523 
1524   /// Estimate cost of a call instruction CI if it were vectorized with factor
1525   /// VF. Return the cost of the instruction, including scalarization overhead
1526   /// if it's needed.
1527   InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const;
1528 
1529   /// Invalidates decisions already taken by the cost model.
1530   void invalidateCostModelingDecisions() {
1531     WideningDecisions.clear();
1532     CallWideningDecisions.clear();
1533     Uniforms.clear();
1534     Scalars.clear();
1535   }
1536 
1537   /// Returns the expected execution cost. The unit of the cost does
1538   /// not matter because we use the 'cost' units to compare different
1539   /// vector widths. The cost that is returned is *not* normalized by
1540   /// the factor width.
1541   InstructionCost expectedCost(ElementCount VF);
1542 
1543   bool hasPredStores() const { return NumPredStores > 0; }
1544 
1545   /// Returns true if epilogue vectorization is considered profitable, and
1546   /// false otherwise.
1547   /// \p VF is the vectorization factor chosen for the original loop.
1548   /// \p Multiplier is an aditional scaling factor applied to VF before
1549   /// comparing to EpilogueVectorizationMinVF.
1550   bool isEpilogueVectorizationProfitable(const ElementCount VF,
1551                                          const unsigned IC) const;
1552 
1553   /// Returns the execution time cost of an instruction for a given vector
1554   /// width. Vector width of one means scalar.
1555   InstructionCost getInstructionCost(Instruction *I, ElementCount VF);
1556 
1557   /// Return the cost of instructions in an inloop reduction pattern, if I is
1558   /// part of that pattern.
1559   std::optional<InstructionCost> getReductionPatternCost(Instruction *I,
1560                                                          ElementCount VF,
1561                                                          Type *VectorTy) const;
1562 
1563   /// Returns true if \p Op should be considered invariant and if it is
1564   /// trivially hoistable.
1565   bool shouldConsiderInvariant(Value *Op);
1566 
1567 private:
1568   unsigned NumPredStores = 0;
1569 
1570   /// \return An upper bound for the vectorization factors for both
1571   /// fixed and scalable vectorization, where the minimum-known number of
1572   /// elements is a power-of-2 larger than zero. If scalable vectorization is
1573   /// disabled or unsupported, then the scalable part will be equal to
1574   /// ElementCount::getScalable(0).
1575   FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount,
1576                                            ElementCount UserVF,
1577                                            bool FoldTailByMasking);
1578 
1579   /// \return the maximized element count based on the targets vector
1580   /// registers and the loop trip-count, but limited to a maximum safe VF.
1581   /// This is a helper function of computeFeasibleMaxVF.
1582   ElementCount getMaximizedVFForTarget(unsigned MaxTripCount,
1583                                        unsigned SmallestType,
1584                                        unsigned WidestType,
1585                                        ElementCount MaxSafeVF,
1586                                        bool FoldTailByMasking);
1587 
1588   /// Checks if scalable vectorization is supported and enabled. Caches the
1589   /// result to avoid repeated debug dumps for repeated queries.
1590   bool isScalableVectorizationAllowed();
1591 
1592   /// \return the maximum legal scalable VF, based on the safe max number
1593   /// of elements.
1594   ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1595 
1596   /// Calculate vectorization cost of memory instruction \p I.
1597   InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1598 
1599   /// The cost computation for scalarized memory instruction.
1600   InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1601 
1602   /// The cost computation for interleaving group of memory instructions.
1603   InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1604 
1605   /// The cost computation for Gather/Scatter instruction.
1606   InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1607 
1608   /// The cost computation for widening instruction \p I with consecutive
1609   /// memory access.
1610   InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1611 
1612   /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1613   /// Load: scalar load + broadcast.
1614   /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1615   /// element)
1616   InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1617 
1618   /// Estimate the overhead of scalarizing an instruction. This is a
1619   /// convenience wrapper for the type-based getScalarizationOverhead API.
1620   InstructionCost getScalarizationOverhead(Instruction *I,
1621                                            ElementCount VF) const;
1622 
1623   /// Returns true if an artificially high cost for emulated masked memrefs
1624   /// should be used.
1625   bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1626 
1627   /// Map of scalar integer values to the smallest bitwidth they can be legally
1628   /// represented as. The vector equivalents of these values should be truncated
1629   /// to this type.
1630   MapVector<Instruction *, uint64_t> MinBWs;
1631 
1632   /// A type representing the costs for instructions if they were to be
1633   /// scalarized rather than vectorized. The entries are Instruction-Cost
1634   /// pairs.
1635   using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1636 
1637   /// A set containing all BasicBlocks that are known to present after
1638   /// vectorization as a predicated block.
1639   DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>>
1640       PredicatedBBsAfterVectorization;
1641 
1642   /// Records whether it is allowed to have the original scalar loop execute at
1643   /// least once. This may be needed as a fallback loop in case runtime
1644   /// aliasing/dependence checks fail, or to handle the tail/remainder
1645   /// iterations when the trip count is unknown or doesn't divide by the VF,
1646   /// or as a peel-loop to handle gaps in interleave-groups.
1647   /// Under optsize and when the trip count is very small we don't allow any
1648   /// iterations to execute in the scalar loop.
1649   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1650 
1651   /// Control finally chosen tail folding style. The first element is used if
1652   /// the IV update may overflow, the second element - if it does not.
1653   std::optional<std::pair<TailFoldingStyle, TailFoldingStyle>>
1654       ChosenTailFoldingStyle;
1655 
1656   /// true if scalable vectorization is supported and enabled.
1657   std::optional<bool> IsScalableVectorizationAllowed;
1658 
1659   /// Maximum safe number of elements to be processed per vector iteration,
1660   /// which do not prevent store-load forwarding and are safe with regard to the
1661   /// memory dependencies. Required for EVL-based veectorization, where this
1662   /// value is used as the upper bound of the safe AVL.
1663   std::optional<unsigned> MaxSafeElements;
1664 
1665   /// A map holding scalar costs for different vectorization factors. The
1666   /// presence of a cost for an instruction in the mapping indicates that the
1667   /// instruction will be scalarized when vectorizing with the associated
1668   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1669   DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1670 
1671   /// Holds the instructions known to be uniform after vectorization.
1672   /// The data is collected per VF.
1673   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1674 
1675   /// Holds the instructions known to be scalar after vectorization.
1676   /// The data is collected per VF.
1677   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1678 
1679   /// Holds the instructions (address computations) that are forced to be
1680   /// scalarized.
1681   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1682 
1683   /// PHINodes of the reductions that should be expanded in-loop.
1684   SmallPtrSet<PHINode *, 4> InLoopReductions;
1685 
1686   /// A Map of inloop reduction operations and their immediate chain operand.
1687   /// FIXME: This can be removed once reductions can be costed correctly in
1688   /// VPlan. This was added to allow quick lookup of the inloop operations.
1689   DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1690 
1691   /// Returns the expected difference in cost from scalarizing the expression
1692   /// feeding a predicated instruction \p PredInst. The instructions to
1693   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1694   /// non-negative return value implies the expression will be scalarized.
1695   /// Currently, only single-use chains are considered for scalarization.
1696   InstructionCost computePredInstDiscount(Instruction *PredInst,
1697                                           ScalarCostsTy &ScalarCosts,
1698                                           ElementCount VF);
1699 
1700   /// Collect the instructions that are uniform after vectorization. An
1701   /// instruction is uniform if we represent it with a single scalar value in
1702   /// the vectorized loop corresponding to each vector iteration. Examples of
1703   /// uniform instructions include pointer operands of consecutive or
1704   /// interleaved memory accesses. Note that although uniformity implies an
1705   /// instruction will be scalar, the reverse is not true. In general, a
1706   /// scalarized instruction will be represented by VF scalar values in the
1707   /// vectorized loop, each corresponding to an iteration of the original
1708   /// scalar loop.
1709   void collectLoopUniforms(ElementCount VF);
1710 
1711   /// Collect the instructions that are scalar after vectorization. An
1712   /// instruction is scalar if it is known to be uniform or will be scalarized
1713   /// during vectorization. collectLoopScalars should only add non-uniform nodes
1714   /// to the list if they are used by a load/store instruction that is marked as
1715   /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
1716   /// VF values in the vectorized loop, each corresponding to an iteration of
1717   /// the original scalar loop.
1718   void collectLoopScalars(ElementCount VF);
1719 
1720   /// Keeps cost model vectorization decision and cost for instructions.
1721   /// Right now it is used for memory instructions only.
1722   using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1723                                 std::pair<InstWidening, InstructionCost>>;
1724 
1725   DecisionList WideningDecisions;
1726 
1727   using CallDecisionList =
1728       DenseMap<std::pair<CallInst *, ElementCount>, CallWideningDecision>;
1729 
1730   CallDecisionList CallWideningDecisions;
1731 
1732   /// Returns true if \p V is expected to be vectorized and it needs to be
1733   /// extracted.
1734   bool needsExtract(Value *V, ElementCount VF) const {
1735     Instruction *I = dyn_cast<Instruction>(V);
1736     if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1737         TheLoop->isLoopInvariant(I) ||
1738         getWideningDecision(I, VF) == CM_Scalarize)
1739       return false;
1740 
1741     // Assume we can vectorize V (and hence we need extraction) if the
1742     // scalars are not computed yet. This can happen, because it is called
1743     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1744     // the scalars are collected. That should be a safe assumption in most
1745     // cases, because we check if the operands have vectorizable types
1746     // beforehand in LoopVectorizationLegality.
1747     return !Scalars.contains(VF) || !isScalarAfterVectorization(I, VF);
1748   };
1749 
1750   /// Returns a range containing only operands needing to be extracted.
1751   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1752                                                    ElementCount VF) const {
1753     return SmallVector<Value *, 4>(make_filter_range(
1754         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1755   }
1756 
1757 public:
1758   /// The loop that we evaluate.
1759   Loop *TheLoop;
1760 
1761   /// Predicated scalar evolution analysis.
1762   PredicatedScalarEvolution &PSE;
1763 
1764   /// Loop Info analysis.
1765   LoopInfo *LI;
1766 
1767   /// Vectorization legality.
1768   LoopVectorizationLegality *Legal;
1769 
1770   /// Vector target information.
1771   const TargetTransformInfo &TTI;
1772 
1773   /// Target Library Info.
1774   const TargetLibraryInfo *TLI;
1775 
1776   /// Demanded bits analysis.
1777   DemandedBits *DB;
1778 
1779   /// Assumption cache.
1780   AssumptionCache *AC;
1781 
1782   /// Interface to emit optimization remarks.
1783   OptimizationRemarkEmitter *ORE;
1784 
1785   const Function *TheFunction;
1786 
1787   /// Loop Vectorize Hint.
1788   const LoopVectorizeHints *Hints;
1789 
1790   /// The interleave access information contains groups of interleaved accesses
1791   /// with the same stride and close to each other.
1792   InterleavedAccessInfo &InterleaveInfo;
1793 
1794   /// Values to ignore in the cost model.
1795   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1796 
1797   /// Values to ignore in the cost model when VF > 1.
1798   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1799 
1800   /// All element types found in the loop.
1801   SmallPtrSet<Type *, 16> ElementTypesInLoop;
1802 
1803   /// The kind of cost that we are calculating
1804   TTI::TargetCostKind CostKind;
1805 };
1806 } // end namespace llvm
1807 
1808 namespace {
1809 /// Helper struct to manage generating runtime checks for vectorization.
1810 ///
1811 /// The runtime checks are created up-front in temporary blocks to allow better
1812 /// estimating the cost and un-linked from the existing IR. After deciding to
1813 /// vectorize, the checks are moved back. If deciding not to vectorize, the
1814 /// temporary blocks are completely removed.
1815 class GeneratedRTChecks {
1816   /// Basic block which contains the generated SCEV checks, if any.
1817   BasicBlock *SCEVCheckBlock = nullptr;
1818 
1819   /// The value representing the result of the generated SCEV checks. If it is
1820   /// nullptr, either no SCEV checks have been generated or they have been used.
1821   Value *SCEVCheckCond = nullptr;
1822 
1823   /// Basic block which contains the generated memory runtime checks, if any.
1824   BasicBlock *MemCheckBlock = nullptr;
1825 
1826   /// The value representing the result of the generated memory runtime checks.
1827   /// If it is nullptr, either no memory runtime checks have been generated or
1828   /// they have been used.
1829   Value *MemRuntimeCheckCond = nullptr;
1830 
1831   DominatorTree *DT;
1832   LoopInfo *LI;
1833   TargetTransformInfo *TTI;
1834 
1835   SCEVExpander SCEVExp;
1836   SCEVExpander MemCheckExp;
1837 
1838   bool CostTooHigh = false;
1839   const bool AddBranchWeights;
1840 
1841   Loop *OuterLoop = nullptr;
1842 
1843   PredicatedScalarEvolution &PSE;
1844 
1845   /// The kind of cost that we are calculating
1846   TTI::TargetCostKind CostKind;
1847 
1848 public:
1849   GeneratedRTChecks(PredicatedScalarEvolution &PSE, DominatorTree *DT,
1850                     LoopInfo *LI, TargetTransformInfo *TTI,
1851                     const DataLayout &DL, bool AddBranchWeights,
1852                     TTI::TargetCostKind CostKind)
1853       : DT(DT), LI(LI), TTI(TTI), SCEVExp(*PSE.getSE(), DL, "scev.check"),
1854         MemCheckExp(*PSE.getSE(), DL, "scev.check"),
1855         AddBranchWeights(AddBranchWeights), PSE(PSE), CostKind(CostKind) {}
1856 
1857   /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1858   /// accurately estimate the cost of the runtime checks. The blocks are
1859   /// un-linked from the IR and are added back during vector code generation. If
1860   /// there is no vector code generation, the check blocks are removed
1861   /// completely.
1862   void create(Loop *L, const LoopAccessInfo &LAI,
1863               const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {
1864 
1865     // Hard cutoff to limit compile-time increase in case a very large number of
1866     // runtime checks needs to be generated.
1867     // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
1868     // profile info.
1869     CostTooHigh =
1870         LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold;
1871     if (CostTooHigh)
1872       return;
1873 
1874     BasicBlock *LoopHeader = L->getHeader();
1875     BasicBlock *Preheader = L->getLoopPreheader();
1876 
1877     // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1878     // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1879     // may be used by SCEVExpander. The blocks will be un-linked from their
1880     // predecessors and removed from LI & DT at the end of the function.
1881     if (!UnionPred.isAlwaysTrue()) {
1882       SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1883                                   nullptr, "vector.scevcheck");
1884 
1885       SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1886           &UnionPred, SCEVCheckBlock->getTerminator());
1887     }
1888 
1889     const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1890     if (RtPtrChecking.Need) {
1891       auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1892       MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
1893                                  "vector.memcheck");
1894 
1895       auto DiffChecks = RtPtrChecking.getDiffChecks();
1896       if (DiffChecks) {
1897         Value *RuntimeVF = nullptr;
1898         MemRuntimeCheckCond = addDiffRuntimeChecks(
1899             MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp,
1900             [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
1901               if (!RuntimeVF)
1902                 RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);
1903               return RuntimeVF;
1904             },
1905             IC);
1906       } else {
1907         MemRuntimeCheckCond = addRuntimeChecks(
1908             MemCheckBlock->getTerminator(), L, RtPtrChecking.getChecks(),
1909             MemCheckExp, VectorizerParams::HoistRuntimeChecks);
1910       }
1911       assert(MemRuntimeCheckCond &&
1912              "no RT checks generated although RtPtrChecking "
1913              "claimed checks are required");
1914     }
1915 
1916     if (!MemCheckBlock && !SCEVCheckBlock)
1917       return;
1918 
1919     // Unhook the temporary block with the checks, update various places
1920     // accordingly.
1921     if (SCEVCheckBlock)
1922       SCEVCheckBlock->replaceAllUsesWith(Preheader);
1923     if (MemCheckBlock)
1924       MemCheckBlock->replaceAllUsesWith(Preheader);
1925 
1926     if (SCEVCheckBlock) {
1927       SCEVCheckBlock->getTerminator()->moveBefore(
1928           Preheader->getTerminator()->getIterator());
1929       new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
1930       Preheader->getTerminator()->eraseFromParent();
1931     }
1932     if (MemCheckBlock) {
1933       MemCheckBlock->getTerminator()->moveBefore(
1934           Preheader->getTerminator()->getIterator());
1935       new UnreachableInst(Preheader->getContext(), MemCheckBlock);
1936       Preheader->getTerminator()->eraseFromParent();
1937     }
1938 
1939     DT->changeImmediateDominator(LoopHeader, Preheader);
1940     if (MemCheckBlock) {
1941       DT->eraseNode(MemCheckBlock);
1942       LI->removeBlock(MemCheckBlock);
1943     }
1944     if (SCEVCheckBlock) {
1945       DT->eraseNode(SCEVCheckBlock);
1946       LI->removeBlock(SCEVCheckBlock);
1947     }
1948 
1949     // Outer loop is used as part of the later cost calculations.
1950     OuterLoop = L->getParentLoop();
1951   }
1952 
1953   InstructionCost getCost() {
1954     if (SCEVCheckBlock || MemCheckBlock)
1955       LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");
1956 
1957     if (CostTooHigh) {
1958       InstructionCost Cost;
1959       Cost.setInvalid();
1960       LLVM_DEBUG(dbgs() << "  number of checks exceeded threshold\n");
1961       return Cost;
1962     }
1963 
1964     InstructionCost RTCheckCost = 0;
1965     if (SCEVCheckBlock)
1966       for (Instruction &I : *SCEVCheckBlock) {
1967         if (SCEVCheckBlock->getTerminator() == &I)
1968           continue;
1969         InstructionCost C = TTI->getInstructionCost(&I, CostKind);
1970         LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
1971         RTCheckCost += C;
1972       }
1973     if (MemCheckBlock) {
1974       InstructionCost MemCheckCost = 0;
1975       for (Instruction &I : *MemCheckBlock) {
1976         if (MemCheckBlock->getTerminator() == &I)
1977           continue;
1978         InstructionCost C = TTI->getInstructionCost(&I, CostKind);
1979         LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
1980         MemCheckCost += C;
1981       }
1982 
1983       // If the runtime memory checks are being created inside an outer loop
1984       // we should find out if these checks are outer loop invariant. If so,
1985       // the checks will likely be hoisted out and so the effective cost will
1986       // reduce according to the outer loop trip count.
1987       if (OuterLoop) {
1988         ScalarEvolution *SE = MemCheckExp.getSE();
1989         // TODO: If profitable, we could refine this further by analysing every
1990         // individual memory check, since there could be a mixture of loop
1991         // variant and invariant checks that mean the final condition is
1992         // variant.
1993         const SCEV *Cond = SE->getSCEV(MemRuntimeCheckCond);
1994         if (SE->isLoopInvariant(Cond, OuterLoop)) {
1995           // It seems reasonable to assume that we can reduce the effective
1996           // cost of the checks even when we know nothing about the trip
1997           // count. Assume that the outer loop executes at least twice.
1998           unsigned BestTripCount = 2;
1999 
2000           // Get the best known TC estimate.
2001           if (auto EstimatedTC = getSmallBestKnownTC(
2002                   PSE, OuterLoop, /* CanUseConstantMax = */ false))
2003             BestTripCount = *EstimatedTC;
2004 
2005           BestTripCount = std::max(BestTripCount, 1U);
2006           InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount;
2007 
2008           // Let's ensure the cost is always at least 1.
2009           NewMemCheckCost = std::max(*NewMemCheckCost.getValue(),
2010                                      (InstructionCost::CostType)1);
2011 
2012           if (BestTripCount > 1)
2013             LLVM_DEBUG(dbgs()
2014                        << "We expect runtime memory checks to be hoisted "
2015                        << "out of the outer loop. Cost reduced from "
2016                        << MemCheckCost << " to " << NewMemCheckCost << '\n');
2017 
2018           MemCheckCost = NewMemCheckCost;
2019         }
2020       }
2021 
2022       RTCheckCost += MemCheckCost;
2023     }
2024 
2025     if (SCEVCheckBlock || MemCheckBlock)
2026       LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
2027                         << "\n");
2028 
2029     return RTCheckCost;
2030   }
2031 
2032   /// Remove the created SCEV & memory runtime check blocks & instructions, if
2033   /// unused.
2034   ~GeneratedRTChecks() {
2035     SCEVExpanderCleaner SCEVCleaner(SCEVExp);
2036     SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
2037     if (!SCEVCheckCond)
2038       SCEVCleaner.markResultUsed();
2039 
2040     if (!MemRuntimeCheckCond)
2041       MemCheckCleaner.markResultUsed();
2042 
2043     if (MemRuntimeCheckCond) {
2044       auto &SE = *MemCheckExp.getSE();
2045       // Memory runtime check generation creates compares that use expanded
2046       // values. Remove them before running the SCEVExpanderCleaners.
2047       for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
2048         if (MemCheckExp.isInsertedInstruction(&I))
2049           continue;
2050         SE.forgetValue(&I);
2051         I.eraseFromParent();
2052       }
2053     }
2054     MemCheckCleaner.cleanup();
2055     SCEVCleaner.cleanup();
2056 
2057     if (SCEVCheckCond)
2058       SCEVCheckBlock->eraseFromParent();
2059     if (MemRuntimeCheckCond)
2060       MemCheckBlock->eraseFromParent();
2061   }
2062 
2063   /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
2064   /// adjusts the branches to branch to the vector preheader or \p Bypass,
2065   /// depending on the generated condition.
2066   BasicBlock *emitSCEVChecks(BasicBlock *Bypass,
2067                              BasicBlock *LoopVectorPreHeader) {
2068     if (!SCEVCheckCond)
2069       return nullptr;
2070 
2071     Value *Cond = SCEVCheckCond;
2072     // Mark the check as used, to prevent it from being removed during cleanup.
2073     SCEVCheckCond = nullptr;
2074     if (auto *C = dyn_cast<ConstantInt>(Cond))
2075       if (C->isZero())
2076         return nullptr;
2077 
2078     auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2079 
2080     BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
2081     // Create new preheader for vector loop.
2082     if (OuterLoop)
2083       OuterLoop->addBasicBlockToLoop(SCEVCheckBlock, *LI);
2084 
2085     SCEVCheckBlock->getTerminator()->eraseFromParent();
2086     SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2087     Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2088                                                 SCEVCheckBlock);
2089 
2090     DT->addNewBlock(SCEVCheckBlock, Pred);
2091     DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
2092 
2093     BranchInst &BI = *BranchInst::Create(Bypass, LoopVectorPreHeader, Cond);
2094     if (AddBranchWeights)
2095       setBranchWeights(BI, SCEVCheckBypassWeights, /*IsExpected=*/false);
2096     ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), &BI);
2097     return SCEVCheckBlock;
2098   }
2099 
2100   /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2101   /// the branches to branch to the vector preheader or \p Bypass, depending on
2102   /// the generated condition.
2103   BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass,
2104                                    BasicBlock *LoopVectorPreHeader) {
2105     // Check if we generated code that checks in runtime if arrays overlap.
2106     if (!MemRuntimeCheckCond)
2107       return nullptr;
2108 
2109     auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2110     Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2111                                                 MemCheckBlock);
2112 
2113     DT->addNewBlock(MemCheckBlock, Pred);
2114     DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
2115     MemCheckBlock->moveBefore(LoopVectorPreHeader);
2116 
2117     if (OuterLoop)
2118       OuterLoop->addBasicBlockToLoop(MemCheckBlock, *LI);
2119 
2120     BranchInst &BI =
2121         *BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond);
2122     if (AddBranchWeights) {
2123       setBranchWeights(BI, MemCheckBypassWeights, /*IsExpected=*/false);
2124     }
2125     ReplaceInstWithInst(MemCheckBlock->getTerminator(), &BI);
2126     MemCheckBlock->getTerminator()->setDebugLoc(
2127         Pred->getTerminator()->getDebugLoc());
2128 
2129     // Mark the check as used, to prevent it from being removed during cleanup.
2130     MemRuntimeCheckCond = nullptr;
2131     return MemCheckBlock;
2132   }
2133 };
2134 } // namespace
2135 
2136 static bool useActiveLaneMask(TailFoldingStyle Style) {
2137   return Style == TailFoldingStyle::Data ||
2138          Style == TailFoldingStyle::DataAndControlFlow ||
2139          Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2140 }
2141 
2142 static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style) {
2143   return Style == TailFoldingStyle::DataAndControlFlow ||
2144          Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2145 }
2146 
2147 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
2148 // vectorization. The loop needs to be annotated with #pragma omp simd
2149 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
2150 // vector length information is not provided, vectorization is not considered
2151 // explicit. Interleave hints are not allowed either. These limitations will be
2152 // relaxed in the future.
2153 // Please, note that we are currently forced to abuse the pragma 'clang
2154 // vectorize' semantics. This pragma provides *auto-vectorization hints*
2155 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2156 // provides *explicit vectorization hints* (LV can bypass legal checks and
2157 // assume that vectorization is legal). However, both hints are implemented
2158 // using the same metadata (llvm.loop.vectorize, processed by
2159 // LoopVectorizeHints). This will be fixed in the future when the native IR
2160 // representation for pragma 'omp simd' is introduced.
2161 static bool isExplicitVecOuterLoop(Loop *OuterLp,
2162                                    OptimizationRemarkEmitter *ORE) {
2163   assert(!OuterLp->isInnermost() && "This is not an outer loop");
2164   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2165 
2166   // Only outer loops with an explicit vectorization hint are supported.
2167   // Unannotated outer loops are ignored.
2168   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
2169     return false;
2170 
2171   Function *Fn = OuterLp->getHeader()->getParent();
2172   if (!Hints.allowVectorization(Fn, OuterLp,
2173                                 true /*VectorizeOnlyWhenForced*/)) {
2174     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2175     return false;
2176   }
2177 
2178   if (Hints.getInterleave() > 1) {
2179     // TODO: Interleave support is future work.
2180     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2181                          "outer loops.\n");
2182     Hints.emitRemarkWithHints();
2183     return false;
2184   }
2185 
2186   return true;
2187 }
2188 
2189 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
2190                                   OptimizationRemarkEmitter *ORE,
2191                                   SmallVectorImpl<Loop *> &V) {
2192   // Collect inner loops and outer loops without irreducible control flow. For
2193   // now, only collect outer loops that have explicit vectorization hints. If we
2194   // are stress testing the VPlan H-CFG construction, we collect the outermost
2195   // loop of every loop nest.
2196   if (L.isInnermost() || VPlanBuildStressTest ||
2197       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
2198     LoopBlocksRPO RPOT(&L);
2199     RPOT.perform(LI);
2200     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2201       V.push_back(&L);
2202       // TODO: Collect inner loops inside marked outer loops in case
2203       // vectorization fails for the outer loop. Do not invoke
2204       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2205       // already known to be reducible. We can use an inherited attribute for
2206       // that.
2207       return;
2208     }
2209   }
2210   for (Loop *InnerL : L)
2211     collectSupportedLoops(*InnerL, LI, ORE, V);
2212 }
2213 
2214 //===----------------------------------------------------------------------===//
2215 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2216 // LoopVectorizationCostModel and LoopVectorizationPlanner.
2217 //===----------------------------------------------------------------------===//
2218 
2219 /// Compute the transformed value of Index at offset StartValue using step
2220 /// StepValue.
2221 /// For integer induction, returns StartValue + Index * StepValue.
2222 /// For pointer induction, returns StartValue[Index * StepValue].
2223 /// FIXME: The newly created binary instructions should contain nsw/nuw
2224 /// flags, which can be found from the original scalar operations.
2225 static Value *
2226 emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue,
2227                      Value *Step,
2228                      InductionDescriptor::InductionKind InductionKind,
2229                      const BinaryOperator *InductionBinOp) {
2230   Type *StepTy = Step->getType();
2231   Value *CastedIndex = StepTy->isIntegerTy()
2232                            ? B.CreateSExtOrTrunc(Index, StepTy)
2233                            : B.CreateCast(Instruction::SIToFP, Index, StepTy);
2234   if (CastedIndex != Index) {
2235     CastedIndex->setName(CastedIndex->getName() + ".cast");
2236     Index = CastedIndex;
2237   }
2238 
2239   // Note: the IR at this point is broken. We cannot use SE to create any new
2240   // SCEV and then expand it, hoping that SCEV's simplification will give us
2241   // a more optimal code. Unfortunately, attempt of doing so on invalid IR may
2242   // lead to various SCEV crashes. So all we can do is to use builder and rely
2243   // on InstCombine for future simplifications. Here we handle some trivial
2244   // cases only.
2245   auto CreateAdd = [&B](Value *X, Value *Y) {
2246     assert(X->getType() == Y->getType() && "Types don't match!");
2247     if (auto *CX = dyn_cast<ConstantInt>(X))
2248       if (CX->isZero())
2249         return Y;
2250     if (auto *CY = dyn_cast<ConstantInt>(Y))
2251       if (CY->isZero())
2252         return X;
2253     return B.CreateAdd(X, Y);
2254   };
2255 
2256   // We allow X to be a vector type, in which case Y will potentially be
2257   // splatted into a vector with the same element count.
2258   auto CreateMul = [&B](Value *X, Value *Y) {
2259     assert(X->getType()->getScalarType() == Y->getType() &&
2260            "Types don't match!");
2261     if (auto *CX = dyn_cast<ConstantInt>(X))
2262       if (CX->isOne())
2263         return Y;
2264     if (auto *CY = dyn_cast<ConstantInt>(Y))
2265       if (CY->isOne())
2266         return X;
2267     VectorType *XVTy = dyn_cast<VectorType>(X->getType());
2268     if (XVTy && !isa<VectorType>(Y->getType()))
2269       Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
2270     return B.CreateMul(X, Y);
2271   };
2272 
2273   switch (InductionKind) {
2274   case InductionDescriptor::IK_IntInduction: {
2275     assert(!isa<VectorType>(Index->getType()) &&
2276            "Vector indices not supported for integer inductions yet");
2277     assert(Index->getType() == StartValue->getType() &&
2278            "Index type does not match StartValue type");
2279     if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne())
2280       return B.CreateSub(StartValue, Index);
2281     auto *Offset = CreateMul(Index, Step);
2282     return CreateAdd(StartValue, Offset);
2283   }
2284   case InductionDescriptor::IK_PtrInduction:
2285     return B.CreatePtrAdd(StartValue, CreateMul(Index, Step));
2286   case InductionDescriptor::IK_FpInduction: {
2287     assert(!isa<VectorType>(Index->getType()) &&
2288            "Vector indices not supported for FP inductions yet");
2289     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2290     assert(InductionBinOp &&
2291            (InductionBinOp->getOpcode() == Instruction::FAdd ||
2292             InductionBinOp->getOpcode() == Instruction::FSub) &&
2293            "Original bin op should be defined for FP induction");
2294 
2295     Value *MulExp = B.CreateFMul(Step, Index);
2296     return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2297                          "induction");
2298   }
2299   case InductionDescriptor::IK_NoInduction:
2300     return nullptr;
2301   }
2302   llvm_unreachable("invalid enum");
2303 }
2304 
2305 std::optional<unsigned> getMaxVScale(const Function &F,
2306                                      const TargetTransformInfo &TTI) {
2307   if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale())
2308     return MaxVScale;
2309 
2310   if (F.hasFnAttribute(Attribute::VScaleRange))
2311     return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
2312 
2313   return std::nullopt;
2314 }
2315 
2316 /// For the given VF and UF and maximum trip count computed for the loop, return
2317 /// whether the induction variable might overflow in the vectorized loop. If not,
2318 /// then we know a runtime overflow check always evaluates to false and can be
2319 /// removed.
2320 static bool isIndvarOverflowCheckKnownFalse(
2321     const LoopVectorizationCostModel *Cost,
2322     ElementCount VF, std::optional<unsigned> UF = std::nullopt) {
2323   // Always be conservative if we don't know the exact unroll factor.
2324   unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF);
2325 
2326   Type *IdxTy = Cost->Legal->getWidestInductionType();
2327   APInt MaxUIntTripCount = cast<IntegerType>(IdxTy)->getMask();
2328 
2329   // We know the runtime overflow check is known false iff the (max) trip-count
2330   // is known and (max) trip-count + (VF * UF) does not overflow in the type of
2331   // the vector loop induction variable.
2332   if (unsigned TC = Cost->PSE.getSmallConstantMaxTripCount()) {
2333     uint64_t MaxVF = VF.getKnownMinValue();
2334     if (VF.isScalable()) {
2335       std::optional<unsigned> MaxVScale =
2336           getMaxVScale(*Cost->TheFunction, Cost->TTI);
2337       if (!MaxVScale)
2338         return false;
2339       MaxVF *= *MaxVScale;
2340     }
2341 
2342     return (MaxUIntTripCount - TC).ugt(MaxVF * MaxUF);
2343   }
2344 
2345   return false;
2346 }
2347 
2348 // Return whether we allow using masked interleave-groups (for dealing with
2349 // strided loads/stores that reside in predicated blocks, or for dealing
2350 // with gaps).
2351 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2352   // If an override option has been passed in for interleaved accesses, use it.
2353   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2354     return EnableMaskedInterleavedMemAccesses;
2355 
2356   return TTI.enableMaskedInterleavedAccessVectorization();
2357 }
2358 
2359 void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr,
2360                                                VPReplicateRecipe *RepRecipe,
2361                                                const VPLane &Lane,
2362                                                VPTransformState &State) {
2363   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2364 
2365   // Does this instruction return a value ?
2366   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2367 
2368   Instruction *Cloned = Instr->clone();
2369   if (!IsVoidRetTy) {
2370     Cloned->setName(Instr->getName() + ".cloned");
2371 #if !defined(NDEBUG)
2372     // Verify that VPlan type inference results agree with the type of the
2373     // generated values.
2374     assert(State.TypeAnalysis.inferScalarType(RepRecipe) == Cloned->getType() &&
2375            "inferred type and type from generated instructions do not match");
2376 #endif
2377   }
2378 
2379   RepRecipe->setFlags(Cloned);
2380 
2381   if (auto DL = Instr->getDebugLoc())
2382     State.setDebugLocFrom(DL);
2383 
2384   // Replace the operands of the cloned instructions with their scalar
2385   // equivalents in the new loop.
2386   for (const auto &I : enumerate(RepRecipe->operands())) {
2387     auto InputLane = Lane;
2388     VPValue *Operand = I.value();
2389     if (vputils::isUniformAfterVectorization(Operand))
2390       InputLane = VPLane::getFirstLane();
2391     Cloned->setOperand(I.index(), State.get(Operand, InputLane));
2392   }
2393   State.addNewMetadata(Cloned, Instr);
2394 
2395   // Place the cloned scalar in the new loop.
2396   State.Builder.Insert(Cloned);
2397 
2398   State.set(RepRecipe, Cloned, Lane);
2399 
2400   // If we just cloned a new assumption, add it the assumption cache.
2401   if (auto *II = dyn_cast<AssumeInst>(Cloned))
2402     AC->registerAssumption(II);
2403 
2404   // End if-block.
2405   VPRegionBlock *Parent = RepRecipe->getParent()->getParent();
2406   bool IfPredicateInstr = Parent ? Parent->isReplicator() : false;
2407   assert(
2408       (Parent || !RepRecipe->getParent()->getPlan()->getVectorLoopRegion() ||
2409        all_of(RepRecipe->operands(),
2410               [](VPValue *Op) { return Op->isDefinedOutsideLoopRegions(); })) &&
2411       "Expected a recipe is either within a region or all of its operands "
2412       "are defined outside the vectorized region.");
2413   if (IfPredicateInstr)
2414     PredicatedInstructions.push_back(Cloned);
2415 }
2416 
2417 Value *
2418 InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
2419   if (VectorTripCount)
2420     return VectorTripCount;
2421 
2422   Value *TC = getTripCount();
2423   IRBuilder<> Builder(InsertBlock->getTerminator());
2424 
2425   Type *Ty = TC->getType();
2426   // This is where we can make the step a runtime constant.
2427   Value *Step = createStepForVF(Builder, Ty, VF, UF);
2428 
2429   // If the tail is to be folded by masking, round the number of iterations N
2430   // up to a multiple of Step instead of rounding down. This is done by first
2431   // adding Step-1 and then rounding down. Note that it's ok if this addition
2432   // overflows: the vector induction variable will eventually wrap to zero given
2433   // that it starts at zero and its Step is a power of two; the loop will then
2434   // exit, with the last early-exit vector comparison also producing all-true.
2435   // For scalable vectors the VF is not guaranteed to be a power of 2, but this
2436   // is accounted for in emitIterationCountCheck that adds an overflow check.
2437   if (Cost->foldTailByMasking()) {
2438     assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
2439            "VF*UF must be a power of 2 when folding tail by masking");
2440     TC = Builder.CreateAdd(TC, Builder.CreateSub(Step, ConstantInt::get(Ty, 1)),
2441                            "n.rnd.up");
2442   }
2443 
2444   // Now we need to generate the expression for the part of the loop that the
2445   // vectorized body will execute. This is equal to N - (N % Step) if scalar
2446   // iterations are not required for correctness, or N - Step, otherwise. Step
2447   // is equal to the vectorization factor (number of SIMD elements) times the
2448   // unroll factor (number of SIMD instructions).
2449   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2450 
2451   // There are cases where we *must* run at least one iteration in the remainder
2452   // loop.  See the cost model for when this can happen.  If the step evenly
2453   // divides the trip count, we set the remainder to be equal to the step. If
2454   // the step does not evenly divide the trip count, no adjustment is necessary
2455   // since there will already be scalar iterations. Note that the minimum
2456   // iterations check ensures that N >= Step.
2457   if (Cost->requiresScalarEpilogue(VF.isVector())) {
2458     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2459     R = Builder.CreateSelect(IsZero, Step, R);
2460   }
2461 
2462   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2463 
2464   return VectorTripCount;
2465 }
2466 
2467 void InnerLoopVectorizer::introduceCheckBlockInVPlan(BasicBlock *CheckIRBB) {
2468   VPBlockBase *ScalarPH = Plan.getScalarPreheader();
2469   VPBlockBase *PreVectorPH = VectorPHVPB->getSinglePredecessor();
2470   if (PreVectorPH->getNumSuccessors() != 1) {
2471     assert(PreVectorPH->getNumSuccessors() == 2 && "Expected 2 successors");
2472     assert(PreVectorPH->getSuccessors()[0] == ScalarPH &&
2473            "Unexpected successor");
2474     VPIRBasicBlock *CheckVPIRBB = Plan.createVPIRBasicBlock(CheckIRBB);
2475     VPBlockUtils::insertOnEdge(PreVectorPH, VectorPHVPB, CheckVPIRBB);
2476     PreVectorPH = CheckVPIRBB;
2477   }
2478   VPBlockUtils::connectBlocks(PreVectorPH, ScalarPH);
2479   PreVectorPH->swapSuccessors();
2480 }
2481 
2482 void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
2483   Value *Count = getTripCount();
2484   // Reuse existing vector loop preheader for TC checks.
2485   // Note that new preheader block is generated for vector loop.
2486   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2487   IRBuilder<> Builder(TCCheckBlock->getTerminator());
2488 
2489   // Generate code to check if the loop's trip count is less than VF * UF, or
2490   // equal to it in case a scalar epilogue is required; this implies that the
2491   // vector trip count is zero. This check also covers the case where adding one
2492   // to the backedge-taken count overflowed leading to an incorrect trip count
2493   // of zero. In this case we will also jump to the scalar loop.
2494   auto P = Cost->requiresScalarEpilogue(VF.isVector()) ? ICmpInst::ICMP_ULE
2495                                                        : ICmpInst::ICMP_ULT;
2496 
2497   // If tail is to be folded, vector loop takes care of all iterations.
2498   Type *CountTy = Count->getType();
2499   Value *CheckMinIters = Builder.getFalse();
2500   auto CreateStep = [&]() -> Value * {
2501     // Create step with max(MinProTripCount, UF * VF).
2502     if (UF * VF.getKnownMinValue() >= MinProfitableTripCount.getKnownMinValue())
2503       return createStepForVF(Builder, CountTy, VF, UF);
2504 
2505     Value *MinProfTC =
2506         createStepForVF(Builder, CountTy, MinProfitableTripCount, 1);
2507     if (!VF.isScalable())
2508       return MinProfTC;
2509     return Builder.CreateBinaryIntrinsic(
2510         Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF));
2511   };
2512 
2513   TailFoldingStyle Style = Cost->getTailFoldingStyle();
2514   if (Style == TailFoldingStyle::None) {
2515     Value *Step = CreateStep();
2516     ScalarEvolution &SE = *PSE.getSE();
2517     // TODO: Emit unconditional branch to vector preheader instead of
2518     // conditional branch with known condition.
2519     const SCEV *TripCountSCEV = SE.applyLoopGuards(SE.getSCEV(Count), OrigLoop);
2520     // Check if the trip count is < the step.
2521     if (SE.isKnownPredicate(P, TripCountSCEV, SE.getSCEV(Step))) {
2522       // TODO: Ensure step is at most the trip count when determining max VF and
2523       // UF, w/o tail folding.
2524       CheckMinIters = Builder.getTrue();
2525     } else if (!SE.isKnownPredicate(CmpInst::getInversePredicate(P),
2526                                     TripCountSCEV, SE.getSCEV(Step))) {
2527       // Generate the minimum iteration check only if we cannot prove the
2528       // check is known to be true, or known to be false.
2529       CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
2530     } // else step known to be < trip count, use CheckMinIters preset to false.
2531   } else if (VF.isScalable() &&
2532              !isIndvarOverflowCheckKnownFalse(Cost, VF, UF) &&
2533              Style != TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) {
2534     // vscale is not necessarily a power-of-2, which means we cannot guarantee
2535     // an overflow to zero when updating induction variables and so an
2536     // additional overflow check is required before entering the vector loop.
2537 
2538     // Get the maximum unsigned value for the type.
2539     Value *MaxUIntTripCount =
2540         ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask());
2541     Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count);
2542 
2543     // Don't execute the vector loop if (UMax - n) < (VF * UF).
2544     CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep());
2545   }
2546 
2547   // Create new preheader for vector loop.
2548   LoopVectorPreHeader =
2549       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
2550                  "vector.ph");
2551 
2552   assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
2553                                DT->getNode(Bypass)->getIDom()) &&
2554          "TC check is expected to dominate Bypass");
2555 
2556   BranchInst &BI =
2557       *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
2558   if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
2559     setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
2560   ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
2561   LoopBypassBlocks.push_back(TCCheckBlock);
2562 
2563   // TODO: Wrap LoopVectorPreHeader in VPIRBasicBlock here.
2564   introduceCheckBlockInVPlan(TCCheckBlock);
2565 }
2566 
2567 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {
2568   BasicBlock *const SCEVCheckBlock =
2569       RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader);
2570   if (!SCEVCheckBlock)
2571     return nullptr;
2572 
2573   assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
2574            (OptForSizeBasedOnProfile &&
2575             Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
2576          "Cannot SCEV check stride or overflow when optimizing for size");
2577   assert(!LoopBypassBlocks.empty() &&
2578          "Should already be a bypass block due to iteration count check");
2579   LoopBypassBlocks.push_back(SCEVCheckBlock);
2580   AddedSafetyChecks = true;
2581 
2582   introduceCheckBlockInVPlan(SCEVCheckBlock);
2583   return SCEVCheckBlock;
2584 }
2585 
2586 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) {
2587   // VPlan-native path does not do any analysis for runtime checks currently.
2588   if (EnableVPlanNativePath)
2589     return nullptr;
2590 
2591   BasicBlock *const MemCheckBlock =
2592       RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader);
2593 
2594   // Check if we generated code that checks in runtime if arrays overlap. We put
2595   // the checks into a separate block to make the more common case of few
2596   // elements faster.
2597   if (!MemCheckBlock)
2598     return nullptr;
2599 
2600   if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
2601     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
2602            "Cannot emit memory checks when optimizing for size, unless forced "
2603            "to vectorize.");
2604     ORE->emit([&]() {
2605       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
2606                                         OrigLoop->getStartLoc(),
2607                                         OrigLoop->getHeader())
2608              << "Code-size may be reduced by not forcing "
2609                 "vectorization, or by source-code modifications "
2610                 "eliminating the need for runtime checks "
2611                 "(e.g., adding 'restrict').";
2612     });
2613   }
2614 
2615   LoopBypassBlocks.push_back(MemCheckBlock);
2616 
2617   AddedSafetyChecks = true;
2618 
2619   introduceCheckBlockInVPlan(MemCheckBlock);
2620   return MemCheckBlock;
2621 }
2622 
2623 /// Replace \p VPBB with a VPIRBasicBlock wrapping \p IRBB. All recipes from \p
2624 /// VPBB are moved to the end of the newly created VPIRBasicBlock. VPBB must
2625 /// have a single predecessor, which is rewired to the new VPIRBasicBlock. All
2626 /// successors of VPBB, if any, are rewired to the new VPIRBasicBlock.
2627 static void replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB) {
2628   VPIRBasicBlock *IRVPBB = VPBB->getPlan()->createVPIRBasicBlock(IRBB);
2629   for (auto &R : make_early_inc_range(*VPBB)) {
2630     assert(!R.isPhi() && "Tried to move phi recipe to end of block");
2631     R.moveBefore(*IRVPBB, IRVPBB->end());
2632   }
2633 
2634   VPBlockUtils::reassociateBlocks(VPBB, IRVPBB);
2635   // VPBB is now dead and will be cleaned up when the plan gets destroyed.
2636 }
2637 
2638 void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
2639   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
2640   assert(LoopVectorPreHeader && "Invalid loop structure");
2641   assert((OrigLoop->getUniqueLatchExitBlock() ||
2642           Cost->requiresScalarEpilogue(VF.isVector())) &&
2643          "loops not exiting via the latch without required epilogue?");
2644 
2645   LoopMiddleBlock =
2646       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
2647                  LI, nullptr, Twine(Prefix) + "middle.block");
2648   replaceVPBBWithIRVPBB(Plan.getMiddleBlock(), LoopMiddleBlock);
2649   LoopScalarPreHeader =
2650       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
2651                  nullptr, Twine(Prefix) + "scalar.ph");
2652   replaceVPBBWithIRVPBB(Plan.getScalarPreheader(), LoopScalarPreHeader);
2653 }
2654 
2655 /// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
2656 /// expansion results.
2657 static Value *getExpandedStep(const InductionDescriptor &ID,
2658                               const SCEV2ValueTy &ExpandedSCEVs) {
2659   const SCEV *Step = ID.getStep();
2660   if (auto *C = dyn_cast<SCEVConstant>(Step))
2661     return C->getValue();
2662   if (auto *U = dyn_cast<SCEVUnknown>(Step))
2663     return U->getValue();
2664   auto I = ExpandedSCEVs.find(Step);
2665   assert(I != ExpandedSCEVs.end() && "SCEV must be expanded at this point");
2666   return I->second;
2667 }
2668 
2669 /// Knowing that loop \p L executes a single vector iteration, add instructions
2670 /// that will get simplified and thus should not have any cost to \p
2671 /// InstsToIgnore.
2672 static void addFullyUnrolledInstructionsToIgnore(
2673     Loop *L, const LoopVectorizationLegality::InductionList &IL,
2674     SmallPtrSetImpl<Instruction *> &InstsToIgnore) {
2675   auto *Cmp = L->getLatchCmpInst();
2676   if (Cmp)
2677     InstsToIgnore.insert(Cmp);
2678   for (const auto &KV : IL) {
2679     // Extract the key by hand so that it can be used in the lambda below.  Note
2680     // that captured structured bindings are a C++20 extension.
2681     const PHINode *IV = KV.first;
2682 
2683     // Get next iteration value of the induction variable.
2684     Instruction *IVInst =
2685         cast<Instruction>(IV->getIncomingValueForBlock(L->getLoopLatch()));
2686     if (all_of(IVInst->users(),
2687                [&](const User *U) { return U == IV || U == Cmp; }))
2688       InstsToIgnore.insert(IVInst);
2689   }
2690 }
2691 
2692 void InnerLoopVectorizer::createInductionAdditionalBypassValues(
2693     const SCEV2ValueTy &ExpandedSCEVs, Value *MainVectorTripCount) {
2694   assert(MainVectorTripCount && "Must have bypass information");
2695 
2696   Instruction *OldInduction = Legal->getPrimaryInduction();
2697   IRBuilder<> BypassBuilder(getAdditionalBypassBlock(),
2698                             getAdditionalBypassBlock()->getFirstInsertionPt());
2699   for (const auto &InductionEntry : Legal->getInductionVars()) {
2700     PHINode *OrigPhi = InductionEntry.first;
2701     const InductionDescriptor &II = InductionEntry.second;
2702     Value *Step = getExpandedStep(II, ExpandedSCEVs);
2703     // For the primary induction the additional bypass end value is known.
2704     // Otherwise it is computed.
2705     Value *EndValueFromAdditionalBypass = MainVectorTripCount;
2706     if (OrigPhi != OldInduction) {
2707       auto *BinOp = II.getInductionBinOp();
2708       // Fast-math-flags propagate from the original induction instruction.
2709       if (isa_and_nonnull<FPMathOperator>(BinOp))
2710         BypassBuilder.setFastMathFlags(BinOp->getFastMathFlags());
2711 
2712       // Compute the end value for the additional bypass.
2713       EndValueFromAdditionalBypass =
2714           emitTransformedIndex(BypassBuilder, MainVectorTripCount,
2715                                II.getStartValue(), Step, II.getKind(), BinOp);
2716       EndValueFromAdditionalBypass->setName("ind.end");
2717     }
2718 
2719     // Store the bypass value here, as it needs to be added as operand to its
2720     // scalar preheader phi node after the epilogue skeleton has been created.
2721     // TODO: Directly add as extra operand to the VPResumePHI recipe.
2722     assert(!Induction2AdditionalBypassValue.contains(OrigPhi) &&
2723            "entry for OrigPhi already exits");
2724     Induction2AdditionalBypassValue[OrigPhi] = EndValueFromAdditionalBypass;
2725   }
2726 }
2727 
2728 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton(
2729     const SCEV2ValueTy &ExpandedSCEVs) {
2730   /*
2731    In this function we generate a new loop. The new loop will contain
2732    the vectorized instructions while the old loop will continue to run the
2733    scalar remainder.
2734 
2735        [ ] <-- old preheader - loop iteration number check and SCEVs in Plan's
2736      /  |      preheader are expanded here. Eventually all required SCEV
2737     /   |      expansion should happen here.
2738    /    v
2739   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
2740   |  /  |
2741   | /   v
2742   ||   [ ]     <-- vector pre header.
2743   |/    |
2744   |     v
2745   |    [  ] \
2746   |    [  ]_|   <-- vector loop (created during VPlan execution).
2747   |     |
2748   |     v
2749   \   -[ ]   <--- middle-block (wrapped in VPIRBasicBlock with the branch to
2750    |    |                       successors created during VPlan execution)
2751    \/   |
2752    /\   v
2753    | ->[ ]     <--- new preheader (wrapped in VPIRBasicBlock).
2754    |    |
2755  (opt)  v      <-- edge from middle to exit iff epilogue is not required.
2756    |   [ ] \
2757    |   [ ]_|   <-- old scalar loop to handle remainder (scalar epilogue, header
2758    |    |          wrapped in VPIRBasicBlock).
2759     \   |
2760      \  v
2761       >[ ]     <-- exit block(s). (wrapped in VPIRBasicBlock)
2762    ...
2763    */
2764 
2765   // Create an empty vector loop, and prepare basic blocks for the runtime
2766   // checks.
2767   createVectorLoopSkeleton("");
2768 
2769   // Now, compare the new count to zero. If it is zero skip the vector loop and
2770   // jump to the scalar loop. This check also covers the case where the
2771   // backedge-taken count is uint##_max: adding one to it will overflow leading
2772   // to an incorrect trip count of zero. In this (rare) case we will also jump
2773   // to the scalar loop.
2774   emitIterationCountCheck(LoopScalarPreHeader);
2775 
2776   // Generate the code to check any assumptions that we've made for SCEV
2777   // expressions.
2778   emitSCEVChecks(LoopScalarPreHeader);
2779 
2780   // Generate the code that checks in runtime if arrays overlap. We put the
2781   // checks into a separate block to make the more common case of few elements
2782   // faster.
2783   emitMemRuntimeChecks(LoopScalarPreHeader);
2784 
2785   return LoopVectorPreHeader;
2786 }
2787 
2788 namespace {
2789 
2790 struct CSEDenseMapInfo {
2791   static bool canHandle(const Instruction *I) {
2792     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
2793            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
2794   }
2795 
2796   static inline Instruction *getEmptyKey() {
2797     return DenseMapInfo<Instruction *>::getEmptyKey();
2798   }
2799 
2800   static inline Instruction *getTombstoneKey() {
2801     return DenseMapInfo<Instruction *>::getTombstoneKey();
2802   }
2803 
2804   static unsigned getHashValue(const Instruction *I) {
2805     assert(canHandle(I) && "Unknown instruction!");
2806     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
2807                                                            I->value_op_end()));
2808   }
2809 
2810   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
2811     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
2812         LHS == getTombstoneKey() || RHS == getTombstoneKey())
2813       return LHS == RHS;
2814     return LHS->isIdenticalTo(RHS);
2815   }
2816 };
2817 
2818 } // end anonymous namespace
2819 
2820 ///Perform cse of induction variable instructions.
2821 static void cse(BasicBlock *BB) {
2822   // Perform simple cse.
2823   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
2824   for (Instruction &In : llvm::make_early_inc_range(*BB)) {
2825     if (!CSEDenseMapInfo::canHandle(&In))
2826       continue;
2827 
2828     // Check if we can replace this instruction with any of the
2829     // visited instructions.
2830     if (Instruction *V = CSEMap.lookup(&In)) {
2831       In.replaceAllUsesWith(V);
2832       In.eraseFromParent();
2833       continue;
2834     }
2835 
2836     CSEMap[&In] = &In;
2837   }
2838 }
2839 
2840 InstructionCost
2841 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
2842                                               ElementCount VF) const {
2843   // We only need to calculate a cost if the VF is scalar; for actual vectors
2844   // we should already have a pre-calculated cost at each VF.
2845   if (!VF.isScalar())
2846     return CallWideningDecisions.at(std::make_pair(CI, VF)).Cost;
2847 
2848   Type *RetTy = CI->getType();
2849   if (RecurrenceDescriptor::isFMulAddIntrinsic(CI))
2850     if (auto RedCost = getReductionPatternCost(CI, VF, RetTy))
2851       return *RedCost;
2852 
2853   SmallVector<Type *, 4> Tys;
2854   for (auto &ArgOp : CI->args())
2855     Tys.push_back(ArgOp->getType());
2856 
2857   InstructionCost ScalarCallCost =
2858       TTI.getCallInstrCost(CI->getCalledFunction(), RetTy, Tys, CostKind);
2859 
2860   // If this is an intrinsic we may have a lower cost for it.
2861   if (getVectorIntrinsicIDForCall(CI, TLI)) {
2862     InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
2863     return std::min(ScalarCallCost, IntrinsicCost);
2864   }
2865   return ScalarCallCost;
2866 }
2867 
2868 static Type *maybeVectorizeType(Type *Elt, ElementCount VF) {
2869   if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
2870     return Elt;
2871   return VectorType::get(Elt, VF);
2872 }
2873 
2874 InstructionCost
2875 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
2876                                                    ElementCount VF) const {
2877   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
2878   assert(ID && "Expected intrinsic call!");
2879   Type *RetTy = maybeVectorizeType(CI->getType(), VF);
2880   FastMathFlags FMF;
2881   if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
2882     FMF = FPMO->getFastMathFlags();
2883 
2884   SmallVector<const Value *> Arguments(CI->args());
2885   FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
2886   SmallVector<Type *> ParamTys;
2887   std::transform(FTy->param_begin(), FTy->param_end(),
2888                  std::back_inserter(ParamTys),
2889                  [&](Type *Ty) { return maybeVectorizeType(Ty, VF); });
2890 
2891   IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
2892                                     dyn_cast<IntrinsicInst>(CI));
2893   return TTI.getIntrinsicInstrCost(CostAttrs, CostKind);
2894 }
2895 
2896 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
2897   // Fix widened non-induction PHIs by setting up the PHI operands.
2898   if (EnableVPlanNativePath)
2899     fixNonInductionPHIs(State);
2900 
2901   // Forget the original basic block.
2902   PSE.getSE()->forgetLoop(OrigLoop);
2903   PSE.getSE()->forgetBlockAndLoopDispositions();
2904 
2905   // After vectorization, the exit blocks of the original loop will have
2906   // additional predecessors. Invalidate SCEVs for the exit phis in case SE
2907   // looked through single-entry phis.
2908   SmallVector<BasicBlock *> ExitBlocks;
2909   OrigLoop->getExitBlocks(ExitBlocks);
2910   for (BasicBlock *Exit : ExitBlocks)
2911     for (PHINode &PN : Exit->phis())
2912       PSE.getSE()->forgetLcssaPhiWithNewPredecessor(OrigLoop, &PN);
2913 
2914   // Don't apply optimizations below when no vector region remains, as they all
2915   // require a vector loop at the moment.
2916   if (!State.Plan->getVectorLoopRegion())
2917     return;
2918 
2919   for (Instruction *PI : PredicatedInstructions)
2920     sinkScalarOperands(&*PI);
2921 
2922   VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion();
2923   VPBasicBlock *HeaderVPBB = VectorRegion->getEntryBasicBlock();
2924   BasicBlock *HeaderBB = State.CFG.VPBB2IRBB[HeaderVPBB];
2925 
2926   // Remove redundant induction instructions.
2927   cse(HeaderBB);
2928 
2929   // Set/update profile weights for the vector and remainder loops as original
2930   // loop iterations are now distributed among them. Note that original loop
2931   // becomes the scalar remainder loop after vectorization.
2932   //
2933   // For cases like foldTailByMasking() and requiresScalarEpiloque() we may
2934   // end up getting slightly roughened result but that should be OK since
2935   // profile is not inherently precise anyway. Note also possible bypass of
2936   // vector code caused by legality checks is ignored, assigning all the weight
2937   // to the vector loop, optimistically.
2938   //
2939   // For scalable vectorization we can't know at compile time how many
2940   // iterations of the loop are handled in one vector iteration, so instead
2941   // assume a pessimistic vscale of '1'.
2942   Loop *VectorLoop = LI->getLoopFor(HeaderBB);
2943   setProfileInfoAfterUnrolling(OrigLoop, VectorLoop, OrigLoop,
2944                                VF.getKnownMinValue() * UF);
2945 }
2946 
2947 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
2948   // The basic block and loop containing the predicated instruction.
2949   auto *PredBB = PredInst->getParent();
2950   auto *VectorLoop = LI->getLoopFor(PredBB);
2951 
2952   // Initialize a worklist with the operands of the predicated instruction.
2953   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
2954 
2955   // Holds instructions that we need to analyze again. An instruction may be
2956   // reanalyzed if we don't yet know if we can sink it or not.
2957   SmallVector<Instruction *, 8> InstsToReanalyze;
2958 
2959   // Returns true if a given use occurs in the predicated block. Phi nodes use
2960   // their operands in their corresponding predecessor blocks.
2961   auto IsBlockOfUsePredicated = [&](Use &U) -> bool {
2962     auto *I = cast<Instruction>(U.getUser());
2963     BasicBlock *BB = I->getParent();
2964     if (auto *Phi = dyn_cast<PHINode>(I))
2965       BB = Phi->getIncomingBlock(
2966           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
2967     return BB == PredBB;
2968   };
2969 
2970   // Iteratively sink the scalarized operands of the predicated instruction
2971   // into the block we created for it. When an instruction is sunk, it's
2972   // operands are then added to the worklist. The algorithm ends after one pass
2973   // through the worklist doesn't sink a single instruction.
2974   bool Changed;
2975   do {
2976     // Add the instructions that need to be reanalyzed to the worklist, and
2977     // reset the changed indicator.
2978     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
2979     InstsToReanalyze.clear();
2980     Changed = false;
2981 
2982     while (!Worklist.empty()) {
2983       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
2984 
2985       // We can't sink an instruction if it is a phi node, is not in the loop,
2986       // may have side effects or may read from memory.
2987       // TODO: Could do more granular checking to allow sinking
2988       // a load past non-store instructions.
2989       if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
2990           I->mayHaveSideEffects() || I->mayReadFromMemory())
2991           continue;
2992 
2993       // If the instruction is already in PredBB, check if we can sink its
2994       // operands. In that case, VPlan's sinkScalarOperands() succeeded in
2995       // sinking the scalar instruction I, hence it appears in PredBB; but it
2996       // may have failed to sink I's operands (recursively), which we try
2997       // (again) here.
2998       if (I->getParent() == PredBB) {
2999         Worklist.insert(I->op_begin(), I->op_end());
3000         continue;
3001       }
3002 
3003       // It's legal to sink the instruction if all its uses occur in the
3004       // predicated block. Otherwise, there's nothing to do yet, and we may
3005       // need to reanalyze the instruction.
3006       if (!llvm::all_of(I->uses(), IsBlockOfUsePredicated)) {
3007         InstsToReanalyze.push_back(I);
3008         continue;
3009       }
3010 
3011       // Move the instruction to the beginning of the predicated block, and add
3012       // it's operands to the worklist.
3013       I->moveBefore(PredBB->getFirstInsertionPt());
3014       Worklist.insert(I->op_begin(), I->op_end());
3015 
3016       // The sinking may have enabled other instructions to be sunk, so we will
3017       // need to iterate.
3018       Changed = true;
3019     }
3020   } while (Changed);
3021 }
3022 
3023 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) {
3024   auto Iter = vp_depth_first_deep(Plan.getEntry());
3025   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
3026     for (VPRecipeBase &P : VPBB->phis()) {
3027       VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P);
3028       if (!VPPhi)
3029         continue;
3030       PHINode *NewPhi = cast<PHINode>(State.get(VPPhi));
3031       // Make sure the builder has a valid insert point.
3032       Builder.SetInsertPoint(NewPhi);
3033       for (unsigned Idx = 0; Idx < VPPhi->getNumOperands(); ++Idx) {
3034         VPValue *Inc = VPPhi->getIncomingValue(Idx);
3035         VPBasicBlock *VPBB = VPPhi->getIncomingBlock(Idx);
3036         NewPhi->addIncoming(State.get(Inc), State.CFG.VPBB2IRBB[VPBB]);
3037       }
3038     }
3039   }
3040 }
3041 
3042 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
3043   // We should not collect Scalars more than once per VF. Right now, this
3044   // function is called from collectUniformsAndScalars(), which already does
3045   // this check. Collecting Scalars for VF=1 does not make any sense.
3046   assert(VF.isVector() && !Scalars.contains(VF) &&
3047          "This function should not be visited twice for the same VF");
3048 
3049   // This avoids any chances of creating a REPLICATE recipe during planning
3050   // since that would result in generation of scalarized code during execution,
3051   // which is not supported for scalable vectors.
3052   if (VF.isScalable()) {
3053     Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end());
3054     return;
3055   }
3056 
3057   SmallSetVector<Instruction *, 8> Worklist;
3058 
3059   // These sets are used to seed the analysis with pointers used by memory
3060   // accesses that will remain scalar.
3061   SmallSetVector<Instruction *, 8> ScalarPtrs;
3062   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
3063   auto *Latch = TheLoop->getLoopLatch();
3064 
3065   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
3066   // The pointer operands of loads and stores will be scalar as long as the
3067   // memory access is not a gather or scatter operation. The value operand of a
3068   // store will remain scalar if the store is scalarized.
3069   auto IsScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
3070     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
3071     assert(WideningDecision != CM_Unknown &&
3072            "Widening decision should be ready at this moment");
3073     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
3074       if (Ptr == Store->getValueOperand())
3075         return WideningDecision == CM_Scalarize;
3076     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
3077            "Ptr is neither a value or pointer operand");
3078     return WideningDecision != CM_GatherScatter;
3079   };
3080 
3081   // A helper that returns true if the given value is a getelementptr
3082   // instruction contained in the loop.
3083   auto IsLoopVaryingGEP = [&](Value *V) {
3084     return isa<GetElementPtrInst>(V) && !TheLoop->isLoopInvariant(V);
3085   };
3086 
3087   // A helper that evaluates a memory access's use of a pointer. If the use will
3088   // be a scalar use and the pointer is only used by memory accesses, we place
3089   // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
3090   // PossibleNonScalarPtrs.
3091   auto EvaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
3092     // We only care about bitcast and getelementptr instructions contained in
3093     // the loop.
3094     if (!IsLoopVaryingGEP(Ptr))
3095       return;
3096 
3097     // If the pointer has already been identified as scalar (e.g., if it was
3098     // also identified as uniform), there's nothing to do.
3099     auto *I = cast<Instruction>(Ptr);
3100     if (Worklist.count(I))
3101       return;
3102 
3103     // If the use of the pointer will be a scalar use, and all users of the
3104     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
3105     // place the pointer in PossibleNonScalarPtrs.
3106     if (IsScalarUse(MemAccess, Ptr) &&
3107         all_of(I->users(), IsaPred<LoadInst, StoreInst>))
3108       ScalarPtrs.insert(I);
3109     else
3110       PossibleNonScalarPtrs.insert(I);
3111   };
3112 
3113   // We seed the scalars analysis with three classes of instructions: (1)
3114   // instructions marked uniform-after-vectorization and (2) bitcast,
3115   // getelementptr and (pointer) phi instructions used by memory accesses
3116   // requiring a scalar use.
3117   //
3118   // (1) Add to the worklist all instructions that have been identified as
3119   // uniform-after-vectorization.
3120   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
3121 
3122   // (2) Add to the worklist all bitcast and getelementptr instructions used by
3123   // memory accesses requiring a scalar use. The pointer operands of loads and
3124   // stores will be scalar unless the operation is a gather or scatter.
3125   // The value operand of a store will remain scalar if the store is scalarized.
3126   for (auto *BB : TheLoop->blocks())
3127     for (auto &I : *BB) {
3128       if (auto *Load = dyn_cast<LoadInst>(&I)) {
3129         EvaluatePtrUse(Load, Load->getPointerOperand());
3130       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
3131         EvaluatePtrUse(Store, Store->getPointerOperand());
3132         EvaluatePtrUse(Store, Store->getValueOperand());
3133       }
3134     }
3135   for (auto *I : ScalarPtrs)
3136     if (!PossibleNonScalarPtrs.count(I)) {
3137       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
3138       Worklist.insert(I);
3139     }
3140 
3141   // Insert the forced scalars.
3142   // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector
3143   // induction variable when the PHI user is scalarized.
3144   auto ForcedScalar = ForcedScalars.find(VF);
3145   if (ForcedScalar != ForcedScalars.end())
3146     for (auto *I : ForcedScalar->second) {
3147       LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n");
3148       Worklist.insert(I);
3149     }
3150 
3151   // Expand the worklist by looking through any bitcasts and getelementptr
3152   // instructions we've already identified as scalar. This is similar to the
3153   // expansion step in collectLoopUniforms(); however, here we're only
3154   // expanding to include additional bitcasts and getelementptr instructions.
3155   unsigned Idx = 0;
3156   while (Idx != Worklist.size()) {
3157     Instruction *Dst = Worklist[Idx++];
3158     if (!IsLoopVaryingGEP(Dst->getOperand(0)))
3159       continue;
3160     auto *Src = cast<Instruction>(Dst->getOperand(0));
3161     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
3162           auto *J = cast<Instruction>(U);
3163           return !TheLoop->contains(J) || Worklist.count(J) ||
3164                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
3165                   IsScalarUse(J, Src));
3166         })) {
3167       Worklist.insert(Src);
3168       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
3169     }
3170   }
3171 
3172   // An induction variable will remain scalar if all users of the induction
3173   // variable and induction variable update remain scalar.
3174   for (const auto &Induction : Legal->getInductionVars()) {
3175     auto *Ind = Induction.first;
3176     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
3177 
3178     // If tail-folding is applied, the primary induction variable will be used
3179     // to feed a vector compare.
3180     if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
3181       continue;
3182 
3183     // Returns true if \p Indvar is a pointer induction that is used directly by
3184     // load/store instruction \p I.
3185     auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
3186                                               Instruction *I) {
3187       return Induction.second.getKind() ==
3188                  InductionDescriptor::IK_PtrInduction &&
3189              (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
3190              Indvar == getLoadStorePointerOperand(I) && IsScalarUse(I, Indvar);
3191     };
3192 
3193     // Determine if all users of the induction variable are scalar after
3194     // vectorization.
3195     bool ScalarInd = all_of(Ind->users(), [&](User *U) -> bool {
3196       auto *I = cast<Instruction>(U);
3197       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
3198              IsDirectLoadStoreFromPtrIndvar(Ind, I);
3199     });
3200     if (!ScalarInd)
3201       continue;
3202 
3203     // If the induction variable update is a fixed-order recurrence, neither the
3204     // induction variable or its update should be marked scalar after
3205     // vectorization.
3206     auto *IndUpdatePhi = dyn_cast<PHINode>(IndUpdate);
3207     if (IndUpdatePhi && Legal->isFixedOrderRecurrence(IndUpdatePhi))
3208       continue;
3209 
3210     // Determine if all users of the induction variable update instruction are
3211     // scalar after vectorization.
3212     bool ScalarIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool {
3213       auto *I = cast<Instruction>(U);
3214       return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
3215              IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
3216     });
3217     if (!ScalarIndUpdate)
3218       continue;
3219 
3220     // The induction variable and its update instruction will remain scalar.
3221     Worklist.insert(Ind);
3222     Worklist.insert(IndUpdate);
3223     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
3224     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
3225                       << "\n");
3226   }
3227 
3228   Scalars[VF].insert(Worklist.begin(), Worklist.end());
3229 }
3230 
3231 bool LoopVectorizationCostModel::isScalarWithPredication(
3232     Instruction *I, ElementCount VF) const {
3233   if (!isPredicatedInst(I))
3234     return false;
3235 
3236   // Do we have a non-scalar lowering for this predicated
3237   // instruction? No - it is scalar with predication.
3238   switch(I->getOpcode()) {
3239   default:
3240     return true;
3241   case Instruction::Call:
3242     if (VF.isScalar())
3243       return true;
3244     return CallWideningDecisions.at(std::make_pair(cast<CallInst>(I), VF))
3245                .Kind == CM_Scalarize;
3246   case Instruction::Load:
3247   case Instruction::Store: {
3248     auto *Ptr = getLoadStorePointerOperand(I);
3249     auto *Ty = getLoadStoreType(I);
3250     Type *VTy = Ty;
3251     if (VF.isVector())
3252       VTy = VectorType::get(Ty, VF);
3253     const Align Alignment = getLoadStoreAlignment(I);
3254     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
3255                                 TTI.isLegalMaskedGather(VTy, Alignment))
3256                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
3257                                 TTI.isLegalMaskedScatter(VTy, Alignment));
3258   }
3259   case Instruction::UDiv:
3260   case Instruction::SDiv:
3261   case Instruction::SRem:
3262   case Instruction::URem: {
3263     // We have the option to use the safe-divisor idiom to avoid predication.
3264     // The cost based decision here will always select safe-divisor for
3265     // scalable vectors as scalarization isn't legal.
3266     const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
3267     return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost);
3268   }
3269   }
3270 }
3271 
3272 // TODO: Fold into LoopVectorizationLegality::isMaskRequired.
3273 bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
3274   // If predication is not needed, avoid it.
3275   // TODO: We can use the loop-preheader as context point here and get
3276   // context sensitive reasoning for isSafeToSpeculativelyExecute.
3277   if (!blockNeedsPredicationForAnyReason(I->getParent()) ||
3278       isSafeToSpeculativelyExecute(I) ||
3279       (isa<LoadInst, StoreInst, CallInst>(I) && !Legal->isMaskRequired(I)) ||
3280       isa<BranchInst, SwitchInst, PHINode, AllocaInst>(I))
3281     return false;
3282 
3283   // If the instruction was executed conditionally in the original scalar loop,
3284   // predication is needed with a mask whose lanes are all possibly inactive.
3285   if (Legal->blockNeedsPredication(I->getParent()))
3286     return true;
3287 
3288   // All that remain are instructions with side-effects originally executed in
3289   // the loop unconditionally, but now execute under a tail-fold mask (only)
3290   // having at least one active lane (the first). If the side-effects of the
3291   // instruction are invariant, executing it w/o (the tail-folding) mask is safe
3292   // - it will cause the same side-effects as when masked.
3293   switch(I->getOpcode()) {
3294   default:
3295     llvm_unreachable(
3296         "instruction should have been considered by earlier checks");
3297   case Instruction::Call:
3298     // Side-effects of a Call are assumed to be non-invariant, needing a
3299     // (fold-tail) mask.
3300     assert(Legal->isMaskRequired(I) &&
3301            "should have returned earlier for calls not needing a mask");
3302     return true;
3303   case Instruction::Load:
3304     // If the address is loop invariant no predication is needed.
3305     return !Legal->isInvariant(getLoadStorePointerOperand(I));
3306   case Instruction::Store: {
3307     // For stores, we need to prove both speculation safety (which follows from
3308     // the same argument as loads), but also must prove the value being stored
3309     // is correct.  The easiest form of the later is to require that all values
3310     // stored are the same.
3311     return !(Legal->isInvariant(getLoadStorePointerOperand(I)) &&
3312              TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()));
3313   }
3314   case Instruction::UDiv:
3315   case Instruction::SDiv:
3316   case Instruction::SRem:
3317   case Instruction::URem:
3318     // If the divisor is loop-invariant no predication is needed.
3319     return !TheLoop->isLoopInvariant(I->getOperand(1));
3320   }
3321 }
3322 
3323 std::pair<InstructionCost, InstructionCost>
3324 LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
3325                                                     ElementCount VF) const {
3326   assert(I->getOpcode() == Instruction::UDiv ||
3327          I->getOpcode() == Instruction::SDiv ||
3328          I->getOpcode() == Instruction::SRem ||
3329          I->getOpcode() == Instruction::URem);
3330   assert(!isSafeToSpeculativelyExecute(I));
3331 
3332   // Scalarization isn't legal for scalable vector types
3333   InstructionCost ScalarizationCost = InstructionCost::getInvalid();
3334   if (!VF.isScalable()) {
3335     // Get the scalarization cost and scale this amount by the probability of
3336     // executing the predicated block. If the instruction is not predicated,
3337     // we fall through to the next case.
3338     ScalarizationCost = 0;
3339 
3340     // These instructions have a non-void type, so account for the phi nodes
3341     // that we will create. This cost is likely to be zero. The phi node
3342     // cost, if any, should be scaled by the block probability because it
3343     // models a copy at the end of each predicated block.
3344     ScalarizationCost += VF.getKnownMinValue() *
3345       TTI.getCFInstrCost(Instruction::PHI, CostKind);
3346 
3347     // The cost of the non-predicated instruction.
3348     ScalarizationCost += VF.getKnownMinValue() *
3349       TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind);
3350 
3351     // The cost of insertelement and extractelement instructions needed for
3352     // scalarization.
3353     ScalarizationCost += getScalarizationOverhead(I, VF);
3354 
3355     // Scale the cost by the probability of executing the predicated blocks.
3356     // This assumes the predicated block for each vector lane is equally
3357     // likely.
3358     ScalarizationCost = ScalarizationCost / getReciprocalPredBlockProb();
3359   }
3360   InstructionCost SafeDivisorCost = 0;
3361 
3362   auto *VecTy = toVectorTy(I->getType(), VF);
3363 
3364   // The cost of the select guard to ensure all lanes are well defined
3365   // after we speculate above any internal control flow.
3366   SafeDivisorCost +=
3367       TTI.getCmpSelInstrCost(Instruction::Select, VecTy,
3368                              toVectorTy(Type::getInt1Ty(I->getContext()), VF),
3369                              CmpInst::BAD_ICMP_PREDICATE, CostKind);
3370 
3371   // Certain instructions can be cheaper to vectorize if they have a constant
3372   // second vector operand. One example of this are shifts on x86.
3373   Value *Op2 = I->getOperand(1);
3374   auto Op2Info = TTI.getOperandInfo(Op2);
3375   if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
3376       Legal->isInvariant(Op2))
3377     Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
3378 
3379   SmallVector<const Value *, 4> Operands(I->operand_values());
3380   SafeDivisorCost += TTI.getArithmeticInstrCost(
3381     I->getOpcode(), VecTy, CostKind,
3382     {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
3383     Op2Info, Operands, I);
3384   return {ScalarizationCost, SafeDivisorCost};
3385 }
3386 
3387 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
3388     Instruction *I, ElementCount VF) const {
3389   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
3390   assert(getWideningDecision(I, VF) == CM_Unknown &&
3391          "Decision should not be set yet.");
3392   auto *Group = getInterleavedAccessGroup(I);
3393   assert(Group && "Must have a group.");
3394   unsigned InterleaveFactor = Group->getFactor();
3395 
3396   // If the instruction's allocated size doesn't equal its type size, it
3397   // requires padding and will be scalarized.
3398   auto &DL = I->getDataLayout();
3399   auto *ScalarTy = getLoadStoreType(I);
3400   if (hasIrregularType(ScalarTy, DL))
3401     return false;
3402 
3403   // We currently only know how to emit interleave/deinterleave with
3404   // Factor=2 for scalable vectors. This is purely an implementation
3405   // limit.
3406   if (VF.isScalable() && InterleaveFactor != 2)
3407     return false;
3408 
3409   // If the group involves a non-integral pointer, we may not be able to
3410   // losslessly cast all values to a common type.
3411   bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy);
3412   for (unsigned Idx = 0; Idx < InterleaveFactor; Idx++) {
3413     Instruction *Member = Group->getMember(Idx);
3414     if (!Member)
3415       continue;
3416     auto *MemberTy = getLoadStoreType(Member);
3417     bool MemberNI = DL.isNonIntegralPointerType(MemberTy);
3418     // Don't coerce non-integral pointers to integers or vice versa.
3419     if (MemberNI != ScalarNI)
3420       // TODO: Consider adding special nullptr value case here
3421       return false;
3422     if (MemberNI && ScalarNI &&
3423         ScalarTy->getPointerAddressSpace() !=
3424             MemberTy->getPointerAddressSpace())
3425       return false;
3426   }
3427 
3428   // Check if masking is required.
3429   // A Group may need masking for one of two reasons: it resides in a block that
3430   // needs predication, or it was decided to use masking to deal with gaps
3431   // (either a gap at the end of a load-access that may result in a speculative
3432   // load, or any gaps in a store-access).
3433   bool PredicatedAccessRequiresMasking =
3434       blockNeedsPredicationForAnyReason(I->getParent()) &&
3435       Legal->isMaskRequired(I);
3436   bool LoadAccessWithGapsRequiresEpilogMasking =
3437       isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
3438       !isScalarEpilogueAllowed();
3439   bool StoreAccessWithGapsRequiresMasking =
3440       isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
3441   if (!PredicatedAccessRequiresMasking &&
3442       !LoadAccessWithGapsRequiresEpilogMasking &&
3443       !StoreAccessWithGapsRequiresMasking)
3444     return true;
3445 
3446   // If masked interleaving is required, we expect that the user/target had
3447   // enabled it, because otherwise it either wouldn't have been created or
3448   // it should have been invalidated by the CostModel.
3449   assert(useMaskedInterleavedAccesses(TTI) &&
3450          "Masked interleave-groups for predicated accesses are not enabled.");
3451 
3452   if (Group->isReverse())
3453     return false;
3454 
3455   auto *Ty = getLoadStoreType(I);
3456   const Align Alignment = getLoadStoreAlignment(I);
3457   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
3458                           : TTI.isLegalMaskedStore(Ty, Alignment);
3459 }
3460 
3461 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
3462     Instruction *I, ElementCount VF) {
3463   // Get and ensure we have a valid memory instruction.
3464   assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
3465 
3466   auto *Ptr = getLoadStorePointerOperand(I);
3467   auto *ScalarTy = getLoadStoreType(I);
3468 
3469   // In order to be widened, the pointer should be consecutive, first of all.
3470   if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
3471     return false;
3472 
3473   // If the instruction is a store located in a predicated block, it will be
3474   // scalarized.
3475   if (isScalarWithPredication(I, VF))
3476     return false;
3477 
3478   // If the instruction's allocated size doesn't equal it's type size, it
3479   // requires padding and will be scalarized.
3480   auto &DL = I->getDataLayout();
3481   if (hasIrregularType(ScalarTy, DL))
3482     return false;
3483 
3484   return true;
3485 }
3486 
3487 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
3488   // We should not collect Uniforms more than once per VF. Right now,
3489   // this function is called from collectUniformsAndScalars(), which
3490   // already does this check. Collecting Uniforms for VF=1 does not make any
3491   // sense.
3492 
3493   assert(VF.isVector() && !Uniforms.contains(VF) &&
3494          "This function should not be visited twice for the same VF");
3495 
3496   // Visit the list of Uniforms. If we find no uniform value, we won't
3497   // analyze again.  Uniforms.count(VF) will return 1.
3498   Uniforms[VF].clear();
3499 
3500   // Now we know that the loop is vectorizable!
3501   // Collect instructions inside the loop that will remain uniform after
3502   // vectorization.
3503 
3504   // Global values, params and instructions outside of current loop are out of
3505   // scope.
3506   auto IsOutOfScope = [&](Value *V) -> bool {
3507     Instruction *I = dyn_cast<Instruction>(V);
3508     return (!I || !TheLoop->contains(I));
3509   };
3510 
3511   // Worklist containing uniform instructions demanding lane 0.
3512   SetVector<Instruction *> Worklist;
3513 
3514   // Add uniform instructions demanding lane 0 to the worklist. Instructions
3515   // that require predication must not be considered uniform after
3516   // vectorization, because that would create an erroneous replicating region
3517   // where only a single instance out of VF should be formed.
3518   auto AddToWorklistIfAllowed = [&](Instruction *I) -> void {
3519     if (IsOutOfScope(I)) {
3520       LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
3521                         << *I << "\n");
3522       return;
3523     }
3524     if (isPredicatedInst(I)) {
3525       LLVM_DEBUG(
3526           dbgs() << "LV: Found not uniform due to requiring predication: " << *I
3527                  << "\n");
3528       return;
3529     }
3530     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
3531     Worklist.insert(I);
3532   };
3533 
3534   // Start with the conditional branches exiting the loop. If the branch
3535   // condition is an instruction contained in the loop that is only used by the
3536   // branch, it is uniform. Note conditions from uncountable early exits are not
3537   // uniform.
3538   SmallVector<BasicBlock *> Exiting;
3539   TheLoop->getExitingBlocks(Exiting);
3540   for (BasicBlock *E : Exiting) {
3541     if (Legal->hasUncountableEarlyExit() && TheLoop->getLoopLatch() != E)
3542       continue;
3543     auto *Cmp = dyn_cast<Instruction>(E->getTerminator()->getOperand(0));
3544     if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
3545       AddToWorklistIfAllowed(Cmp);
3546   }
3547 
3548   auto PrevVF = VF.divideCoefficientBy(2);
3549   // Return true if all lanes perform the same memory operation, and we can
3550   // thus choose to execute only one.
3551   auto IsUniformMemOpUse = [&](Instruction *I) {
3552     // If the value was already known to not be uniform for the previous
3553     // (smaller VF), it cannot be uniform for the larger VF.
3554     if (PrevVF.isVector()) {
3555       auto Iter = Uniforms.find(PrevVF);
3556       if (Iter != Uniforms.end() && !Iter->second.contains(I))
3557         return false;
3558     }
3559     if (!Legal->isUniformMemOp(*I, VF))
3560       return false;
3561     if (isa<LoadInst>(I))
3562       // Loading the same address always produces the same result - at least
3563       // assuming aliasing and ordering which have already been checked.
3564       return true;
3565     // Storing the same value on every iteration.
3566     return TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand());
3567   };
3568 
3569   auto IsUniformDecision = [&](Instruction *I, ElementCount VF) {
3570     InstWidening WideningDecision = getWideningDecision(I, VF);
3571     assert(WideningDecision != CM_Unknown &&
3572            "Widening decision should be ready at this moment");
3573 
3574     if (IsUniformMemOpUse(I))
3575       return true;
3576 
3577     return (WideningDecision == CM_Widen ||
3578             WideningDecision == CM_Widen_Reverse ||
3579             WideningDecision == CM_Interleave);
3580   };
3581 
3582   // Returns true if Ptr is the pointer operand of a memory access instruction
3583   // I, I is known to not require scalarization, and the pointer is not also
3584   // stored.
3585   auto IsVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
3586     if (isa<StoreInst>(I) && I->getOperand(0) == Ptr)
3587       return false;
3588     return getLoadStorePointerOperand(I) == Ptr &&
3589            (IsUniformDecision(I, VF) || Legal->isInvariant(Ptr));
3590   };
3591 
3592   // Holds a list of values which are known to have at least one uniform use.
3593   // Note that there may be other uses which aren't uniform.  A "uniform use"
3594   // here is something which only demands lane 0 of the unrolled iterations;
3595   // it does not imply that all lanes produce the same value (e.g. this is not
3596   // the usual meaning of uniform)
3597   SetVector<Value *> HasUniformUse;
3598 
3599   // Scan the loop for instructions which are either a) known to have only
3600   // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
3601   for (auto *BB : TheLoop->blocks())
3602     for (auto &I : *BB) {
3603       if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
3604         switch (II->getIntrinsicID()) {
3605         case Intrinsic::sideeffect:
3606         case Intrinsic::experimental_noalias_scope_decl:
3607         case Intrinsic::assume:
3608         case Intrinsic::lifetime_start:
3609         case Intrinsic::lifetime_end:
3610           if (TheLoop->hasLoopInvariantOperands(&I))
3611             AddToWorklistIfAllowed(&I);
3612           break;
3613         default:
3614           break;
3615         }
3616       }
3617 
3618       // ExtractValue instructions must be uniform, because the operands are
3619       // known to be loop-invariant.
3620       if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
3621         assert(IsOutOfScope(EVI->getAggregateOperand()) &&
3622                "Expected aggregate value to be loop invariant");
3623         AddToWorklistIfAllowed(EVI);
3624         continue;
3625       }
3626 
3627       // If there's no pointer operand, there's nothing to do.
3628       auto *Ptr = getLoadStorePointerOperand(&I);
3629       if (!Ptr)
3630         continue;
3631 
3632       if (IsUniformMemOpUse(&I))
3633         AddToWorklistIfAllowed(&I);
3634 
3635       if (IsVectorizedMemAccessUse(&I, Ptr))
3636         HasUniformUse.insert(Ptr);
3637     }
3638 
3639   // Add to the worklist any operands which have *only* uniform (e.g. lane 0
3640   // demanding) users.  Since loops are assumed to be in LCSSA form, this
3641   // disallows uses outside the loop as well.
3642   for (auto *V : HasUniformUse) {
3643     if (IsOutOfScope(V))
3644       continue;
3645     auto *I = cast<Instruction>(V);
3646     bool UsersAreMemAccesses = all_of(I->users(), [&](User *U) -> bool {
3647       auto *UI = cast<Instruction>(U);
3648       return TheLoop->contains(UI) && IsVectorizedMemAccessUse(UI, V);
3649     });
3650     if (UsersAreMemAccesses)
3651       AddToWorklistIfAllowed(I);
3652   }
3653 
3654   // Expand Worklist in topological order: whenever a new instruction
3655   // is added , its users should be already inside Worklist.  It ensures
3656   // a uniform instruction will only be used by uniform instructions.
3657   unsigned Idx = 0;
3658   while (Idx != Worklist.size()) {
3659     Instruction *I = Worklist[Idx++];
3660 
3661     for (auto *OV : I->operand_values()) {
3662       // isOutOfScope operands cannot be uniform instructions.
3663       if (IsOutOfScope(OV))
3664         continue;
3665       // First order recurrence Phi's should typically be considered
3666       // non-uniform.
3667       auto *OP = dyn_cast<PHINode>(OV);
3668       if (OP && Legal->isFixedOrderRecurrence(OP))
3669         continue;
3670       // If all the users of the operand are uniform, then add the
3671       // operand into the uniform worklist.
3672       auto *OI = cast<Instruction>(OV);
3673       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
3674             auto *J = cast<Instruction>(U);
3675             return Worklist.count(J) || IsVectorizedMemAccessUse(J, OI);
3676           }))
3677         AddToWorklistIfAllowed(OI);
3678     }
3679   }
3680 
3681   // For an instruction to be added into Worklist above, all its users inside
3682   // the loop should also be in Worklist. However, this condition cannot be
3683   // true for phi nodes that form a cyclic dependence. We must process phi
3684   // nodes separately. An induction variable will remain uniform if all users
3685   // of the induction variable and induction variable update remain uniform.
3686   // The code below handles both pointer and non-pointer induction variables.
3687   BasicBlock *Latch = TheLoop->getLoopLatch();
3688   for (const auto &Induction : Legal->getInductionVars()) {
3689     auto *Ind = Induction.first;
3690     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
3691 
3692     // Determine if all users of the induction variable are uniform after
3693     // vectorization.
3694     bool UniformInd = all_of(Ind->users(), [&](User *U) -> bool {
3695       auto *I = cast<Instruction>(U);
3696       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
3697              IsVectorizedMemAccessUse(I, Ind);
3698     });
3699     if (!UniformInd)
3700       continue;
3701 
3702     // Determine if all users of the induction variable update instruction are
3703     // uniform after vectorization.
3704     bool UniformIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool {
3705       auto *I = cast<Instruction>(U);
3706       return I == Ind || Worklist.count(I) ||
3707              IsVectorizedMemAccessUse(I, IndUpdate);
3708     });
3709     if (!UniformIndUpdate)
3710       continue;
3711 
3712     // The induction variable and its update instruction will remain uniform.
3713     AddToWorklistIfAllowed(Ind);
3714     AddToWorklistIfAllowed(IndUpdate);
3715   }
3716 
3717   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
3718 }
3719 
3720 bool LoopVectorizationCostModel::runtimeChecksRequired() {
3721   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
3722 
3723   if (Legal->getRuntimePointerChecking()->Need) {
3724     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
3725         "runtime pointer checks needed. Enable vectorization of this "
3726         "loop with '#pragma clang loop vectorize(enable)' when "
3727         "compiling with -Os/-Oz",
3728         "CantVersionLoopWithOptForSize", ORE, TheLoop);
3729     return true;
3730   }
3731 
3732   if (!PSE.getPredicate().isAlwaysTrue()) {
3733     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
3734         "runtime SCEV checks needed. Enable vectorization of this "
3735         "loop with '#pragma clang loop vectorize(enable)' when "
3736         "compiling with -Os/-Oz",
3737         "CantVersionLoopWithOptForSize", ORE, TheLoop);
3738     return true;
3739   }
3740 
3741   // FIXME: Avoid specializing for stride==1 instead of bailing out.
3742   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
3743     reportVectorizationFailure("Runtime stride check for small trip count",
3744         "runtime stride == 1 checks needed. Enable vectorization of "
3745         "this loop without such check by compiling with -Os/-Oz",
3746         "CantVersionLoopWithOptForSize", ORE, TheLoop);
3747     return true;
3748   }
3749 
3750   return false;
3751 }
3752 
3753 bool LoopVectorizationCostModel::isScalableVectorizationAllowed() {
3754   if (IsScalableVectorizationAllowed)
3755     return *IsScalableVectorizationAllowed;
3756 
3757   IsScalableVectorizationAllowed = false;
3758   if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
3759     return false;
3760 
3761   if (Hints->isScalableVectorizationDisabled()) {
3762     reportVectorizationInfo("Scalable vectorization is explicitly disabled",
3763                             "ScalableVectorizationDisabled", ORE, TheLoop);
3764     return false;
3765   }
3766 
3767   LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
3768 
3769   auto MaxScalableVF = ElementCount::getScalable(
3770       std::numeric_limits<ElementCount::ScalarTy>::max());
3771 
3772   // Test that the loop-vectorizer can legalize all operations for this MaxVF.
3773   // FIXME: While for scalable vectors this is currently sufficient, this should
3774   // be replaced by a more detailed mechanism that filters out specific VFs,
3775   // instead of invalidating vectorization for a whole set of VFs based on the
3776   // MaxVF.
3777 
3778   // Disable scalable vectorization if the loop contains unsupported reductions.
3779   if (!canVectorizeReductions(MaxScalableVF)) {
3780     reportVectorizationInfo(
3781         "Scalable vectorization not supported for the reduction "
3782         "operations found in this loop.",
3783         "ScalableVFUnfeasible", ORE, TheLoop);
3784     return false;
3785   }
3786 
3787   // Disable scalable vectorization if the loop contains any instructions
3788   // with element types not supported for scalable vectors.
3789   if (any_of(ElementTypesInLoop, [&](Type *Ty) {
3790         return !Ty->isVoidTy() &&
3791                !this->TTI.isElementTypeLegalForScalableVector(Ty);
3792       })) {
3793     reportVectorizationInfo("Scalable vectorization is not supported "
3794                             "for all element types found in this loop.",
3795                             "ScalableVFUnfeasible", ORE, TheLoop);
3796     return false;
3797   }
3798 
3799   if (!Legal->isSafeForAnyVectorWidth() && !getMaxVScale(*TheFunction, TTI)) {
3800     reportVectorizationInfo("The target does not provide maximum vscale value "
3801                             "for safe distance analysis.",
3802                             "ScalableVFUnfeasible", ORE, TheLoop);
3803     return false;
3804   }
3805 
3806   IsScalableVectorizationAllowed = true;
3807   return true;
3808 }
3809 
3810 ElementCount
3811 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
3812   if (!isScalableVectorizationAllowed())
3813     return ElementCount::getScalable(0);
3814 
3815   auto MaxScalableVF = ElementCount::getScalable(
3816       std::numeric_limits<ElementCount::ScalarTy>::max());
3817   if (Legal->isSafeForAnyVectorWidth())
3818     return MaxScalableVF;
3819 
3820   std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
3821   // Limit MaxScalableVF by the maximum safe dependence distance.
3822   MaxScalableVF = ElementCount::getScalable(MaxSafeElements / *MaxVScale);
3823 
3824   if (!MaxScalableVF)
3825     reportVectorizationInfo(
3826         "Max legal vector width too small, scalable vectorization "
3827         "unfeasible.",
3828         "ScalableVFUnfeasible", ORE, TheLoop);
3829 
3830   return MaxScalableVF;
3831 }
3832 
3833 FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
3834     unsigned MaxTripCount, ElementCount UserVF, bool FoldTailByMasking) {
3835   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
3836   unsigned SmallestType, WidestType;
3837   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
3838 
3839   // Get the maximum safe dependence distance in bits computed by LAA.
3840   // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
3841   // the memory accesses that is most restrictive (involved in the smallest
3842   // dependence distance).
3843   unsigned MaxSafeElements =
3844       llvm::bit_floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
3845 
3846   auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
3847   auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
3848   if (!Legal->isSafeForAnyVectorWidth())
3849     this->MaxSafeElements = MaxSafeElements;
3850 
3851   LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
3852                     << ".\n");
3853   LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
3854                     << ".\n");
3855 
3856   // First analyze the UserVF, fall back if the UserVF should be ignored.
3857   if (UserVF) {
3858     auto MaxSafeUserVF =
3859         UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
3860 
3861     if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
3862       // If `VF=vscale x N` is safe, then so is `VF=N`
3863       if (UserVF.isScalable())
3864         return FixedScalableVFPair(
3865             ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
3866 
3867       return UserVF;
3868     }
3869 
3870     assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
3871 
3872     // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
3873     // is better to ignore the hint and let the compiler choose a suitable VF.
3874     if (!UserVF.isScalable()) {
3875       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3876                         << " is unsafe, clamping to max safe VF="
3877                         << MaxSafeFixedVF << ".\n");
3878       ORE->emit([&]() {
3879         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
3880                                           TheLoop->getStartLoc(),
3881                                           TheLoop->getHeader())
3882                << "User-specified vectorization factor "
3883                << ore::NV("UserVectorizationFactor", UserVF)
3884                << " is unsafe, clamping to maximum safe vectorization factor "
3885                << ore::NV("VectorizationFactor", MaxSafeFixedVF);
3886       });
3887       return MaxSafeFixedVF;
3888     }
3889 
3890     if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
3891       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3892                         << " is ignored because scalable vectors are not "
3893                            "available.\n");
3894       ORE->emit([&]() {
3895         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
3896                                           TheLoop->getStartLoc(),
3897                                           TheLoop->getHeader())
3898                << "User-specified vectorization factor "
3899                << ore::NV("UserVectorizationFactor", UserVF)
3900                << " is ignored because the target does not support scalable "
3901                   "vectors. The compiler will pick a more suitable value.";
3902       });
3903     } else {
3904       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3905                         << " is unsafe. Ignoring scalable UserVF.\n");
3906       ORE->emit([&]() {
3907         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
3908                                           TheLoop->getStartLoc(),
3909                                           TheLoop->getHeader())
3910                << "User-specified vectorization factor "
3911                << ore::NV("UserVectorizationFactor", UserVF)
3912                << " is unsafe. Ignoring the hint to let the compiler pick a "
3913                   "more suitable value.";
3914       });
3915     }
3916   }
3917 
3918   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
3919                     << " / " << WidestType << " bits.\n");
3920 
3921   FixedScalableVFPair Result(ElementCount::getFixed(1),
3922                              ElementCount::getScalable(0));
3923   if (auto MaxVF =
3924           getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
3925                                   MaxSafeFixedVF, FoldTailByMasking))
3926     Result.FixedVF = MaxVF;
3927 
3928   if (auto MaxVF =
3929           getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
3930                                   MaxSafeScalableVF, FoldTailByMasking))
3931     if (MaxVF.isScalable()) {
3932       Result.ScalableVF = MaxVF;
3933       LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
3934                         << "\n");
3935     }
3936 
3937   return Result;
3938 }
3939 
3940 FixedScalableVFPair
3941 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
3942   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
3943     // TODO: It may be useful to do since it's still likely to be dynamically
3944     // uniform if the target can skip.
3945     reportVectorizationFailure(
3946         "Not inserting runtime ptr check for divergent target",
3947         "runtime pointer checks needed. Not enabled for divergent target",
3948         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
3949     return FixedScalableVFPair::getNone();
3950   }
3951 
3952   ScalarEvolution *SE = PSE.getSE();
3953   unsigned TC = SE->getSmallConstantTripCount(TheLoop);
3954   unsigned MaxTC = PSE.getSmallConstantMaxTripCount();
3955   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
3956   if (TC != MaxTC)
3957     LLVM_DEBUG(dbgs() << "LV: Found maximum trip count: " << MaxTC << '\n');
3958   if (TC == 1) {
3959     reportVectorizationFailure("Single iteration (non) loop",
3960         "loop trip count is one, irrelevant for vectorization",
3961         "SingleIterationLoop", ORE, TheLoop);
3962     return FixedScalableVFPair::getNone();
3963   }
3964 
3965   // If BTC matches the widest induction type and is -1 then the trip count
3966   // computation will wrap to 0 and the vector trip count will be 0. Do not try
3967   // to vectorize.
3968   const SCEV *BTC = SE->getBackedgeTakenCount(TheLoop);
3969   if (!isa<SCEVCouldNotCompute>(BTC) &&
3970       BTC->getType()->getScalarSizeInBits() >=
3971           Legal->getWidestInductionType()->getScalarSizeInBits() &&
3972       SE->isKnownPredicate(CmpInst::ICMP_EQ, BTC,
3973                            SE->getMinusOne(BTC->getType()))) {
3974     reportVectorizationFailure(
3975         "Trip count computation wrapped",
3976         "backedge-taken count is -1, loop trip count wrapped to 0",
3977         "TripCountWrapped", ORE, TheLoop);
3978     return FixedScalableVFPair::getNone();
3979   }
3980 
3981   switch (ScalarEpilogueStatus) {
3982   case CM_ScalarEpilogueAllowed:
3983     return computeFeasibleMaxVF(MaxTC, UserVF, false);
3984   case CM_ScalarEpilogueNotAllowedUsePredicate:
3985     [[fallthrough]];
3986   case CM_ScalarEpilogueNotNeededUsePredicate:
3987     LLVM_DEBUG(
3988         dbgs() << "LV: vector predicate hint/switch found.\n"
3989                << "LV: Not allowing scalar epilogue, creating predicated "
3990                << "vector loop.\n");
3991     break;
3992   case CM_ScalarEpilogueNotAllowedLowTripLoop:
3993     // fallthrough as a special case of OptForSize
3994   case CM_ScalarEpilogueNotAllowedOptSize:
3995     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
3996       LLVM_DEBUG(
3997           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
3998     else
3999       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
4000                         << "count.\n");
4001 
4002     // Bail if runtime checks are required, which are not good when optimising
4003     // for size.
4004     if (runtimeChecksRequired())
4005       return FixedScalableVFPair::getNone();
4006 
4007     break;
4008   }
4009 
4010   // The only loops we can vectorize without a scalar epilogue, are loops with
4011   // a bottom-test and a single exiting block. We'd have to handle the fact
4012   // that not every instruction executes on the last iteration.  This will
4013   // require a lane mask which varies through the vector loop body.  (TODO)
4014   if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
4015     // If there was a tail-folding hint/switch, but we can't fold the tail by
4016     // masking, fallback to a vectorization with a scalar epilogue.
4017     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
4018       LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
4019                            "scalar epilogue instead.\n");
4020       ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4021       return computeFeasibleMaxVF(MaxTC, UserVF, false);
4022     }
4023     return FixedScalableVFPair::getNone();
4024   }
4025 
4026   // Now try the tail folding
4027 
4028   // Invalidate interleave groups that require an epilogue if we can't mask
4029   // the interleave-group.
4030   if (!useMaskedInterleavedAccesses(TTI)) {
4031     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
4032            "No decisions should have been taken at this point");
4033     // Note: There is no need to invalidate any cost modeling decisions here, as
4034     // none were taken so far.
4035     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
4036   }
4037 
4038   FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(MaxTC, UserVF, true);
4039 
4040   // Avoid tail folding if the trip count is known to be a multiple of any VF
4041   // we choose.
4042   std::optional<unsigned> MaxPowerOf2RuntimeVF =
4043       MaxFactors.FixedVF.getFixedValue();
4044   if (MaxFactors.ScalableVF) {
4045     std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
4046     if (MaxVScale && TTI.isVScaleKnownToBeAPowerOfTwo()) {
4047       MaxPowerOf2RuntimeVF = std::max<unsigned>(
4048           *MaxPowerOf2RuntimeVF,
4049           *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue());
4050     } else
4051       MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now.
4052   }
4053 
4054   if (MaxPowerOf2RuntimeVF && *MaxPowerOf2RuntimeVF > 0) {
4055     assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
4056            "MaxFixedVF must be a power of 2");
4057     unsigned MaxVFtimesIC =
4058         UserIC ? *MaxPowerOf2RuntimeVF * UserIC : *MaxPowerOf2RuntimeVF;
4059     ScalarEvolution *SE = PSE.getSE();
4060     // Currently only loops with countable exits are vectorized, but calling
4061     // getSymbolicMaxBackedgeTakenCount allows enablement work for loops with
4062     // uncountable exits whilst also ensuring the symbolic maximum and known
4063     // back-edge taken count remain identical for loops with countable exits.
4064     const SCEV *BackedgeTakenCount = PSE.getSymbolicMaxBackedgeTakenCount();
4065     assert(BackedgeTakenCount == PSE.getBackedgeTakenCount() &&
4066            "Invalid loop count");
4067     const SCEV *ExitCount = SE->getAddExpr(
4068         BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
4069     const SCEV *Rem = SE->getURemExpr(
4070         SE->applyLoopGuards(ExitCount, TheLoop),
4071         SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
4072     if (Rem->isZero()) {
4073       // Accept MaxFixedVF if we do not have a tail.
4074       LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
4075       return MaxFactors;
4076     }
4077   }
4078 
4079   // If we don't know the precise trip count, or if the trip count that we
4080   // found modulo the vectorization factor is not zero, try to fold the tail
4081   // by masking.
4082   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
4083   setTailFoldingStyles(MaxFactors.ScalableVF.isScalable(), UserIC);
4084   if (foldTailByMasking()) {
4085     if (getTailFoldingStyle() == TailFoldingStyle::DataWithEVL) {
4086       LLVM_DEBUG(
4087           dbgs()
4088           << "LV: tail is folded with EVL, forcing unroll factor to be 1. Will "
4089              "try to generate VP Intrinsics with scalable vector "
4090              "factors only.\n");
4091       // Tail folded loop using VP intrinsics restricts the VF to be scalable
4092       // for now.
4093       // TODO: extend it for fixed vectors, if required.
4094       assert(MaxFactors.ScalableVF.isScalable() &&
4095              "Expected scalable vector factor.");
4096 
4097       MaxFactors.FixedVF = ElementCount::getFixed(1);
4098     }
4099     return MaxFactors;
4100   }
4101 
4102   // If there was a tail-folding hint/switch, but we can't fold the tail by
4103   // masking, fallback to a vectorization with a scalar epilogue.
4104   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
4105     LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
4106                          "scalar epilogue instead.\n");
4107     ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4108     return MaxFactors;
4109   }
4110 
4111   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
4112     LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
4113     return FixedScalableVFPair::getNone();
4114   }
4115 
4116   if (TC == 0) {
4117     reportVectorizationFailure(
4118         "unable to calculate the loop count due to complex control flow",
4119         "UnknownLoopCountComplexCFG", ORE, TheLoop);
4120     return FixedScalableVFPair::getNone();
4121   }
4122 
4123   reportVectorizationFailure(
4124       "Cannot optimize for size and vectorize at the same time.",
4125       "cannot optimize for size and vectorize at the same time. "
4126       "Enable vectorization of this loop with '#pragma clang loop "
4127       "vectorize(enable)' when compiling with -Os/-Oz",
4128       "NoTailLoopWithOptForSize", ORE, TheLoop);
4129   return FixedScalableVFPair::getNone();
4130 }
4131 
4132 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
4133     unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType,
4134     ElementCount MaxSafeVF, bool FoldTailByMasking) {
4135   bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
4136   const TypeSize WidestRegister = TTI.getRegisterBitWidth(
4137       ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
4138                            : TargetTransformInfo::RGK_FixedWidthVector);
4139 
4140   // Convenience function to return the minimum of two ElementCounts.
4141   auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
4142     assert((LHS.isScalable() == RHS.isScalable()) &&
4143            "Scalable flags must match");
4144     return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
4145   };
4146 
4147   // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
4148   // Note that both WidestRegister and WidestType may not be a powers of 2.
4149   auto MaxVectorElementCount = ElementCount::get(
4150       llvm::bit_floor(WidestRegister.getKnownMinValue() / WidestType),
4151       ComputeScalableMaxVF);
4152   MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
4153   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
4154                     << (MaxVectorElementCount * WidestType) << " bits.\n");
4155 
4156   if (!MaxVectorElementCount) {
4157     LLVM_DEBUG(dbgs() << "LV: The target has no "
4158                       << (ComputeScalableMaxVF ? "scalable" : "fixed")
4159                       << " vector registers.\n");
4160     return ElementCount::getFixed(1);
4161   }
4162 
4163   unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue();
4164   if (MaxVectorElementCount.isScalable() &&
4165       TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
4166     auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
4167     auto Min = Attr.getVScaleRangeMin();
4168     WidestRegisterMinEC *= Min;
4169   }
4170 
4171   // When a scalar epilogue is required, at least one iteration of the scalar
4172   // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
4173   // max VF that results in a dead vector loop.
4174   if (MaxTripCount > 0 && requiresScalarEpilogue(true))
4175     MaxTripCount -= 1;
4176 
4177   if (MaxTripCount && MaxTripCount <= WidestRegisterMinEC &&
4178       (!FoldTailByMasking || isPowerOf2_32(MaxTripCount))) {
4179     // If upper bound loop trip count (TC) is known at compile time there is no
4180     // point in choosing VF greater than TC (as done in the loop below). Select
4181     // maximum power of two which doesn't exceed TC. If MaxVectorElementCount is
4182     // scalable, we only fall back on a fixed VF when the TC is less than or
4183     // equal to the known number of lanes.
4184     auto ClampedUpperTripCount = llvm::bit_floor(MaxTripCount);
4185     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
4186                          "exceeding the constant trip count: "
4187                       << ClampedUpperTripCount << "\n");
4188     return ElementCount::get(
4189         ClampedUpperTripCount,
4190         FoldTailByMasking ? MaxVectorElementCount.isScalable() : false);
4191   }
4192 
4193   TargetTransformInfo::RegisterKind RegKind =
4194       ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
4195                            : TargetTransformInfo::RGK_FixedWidthVector;
4196   ElementCount MaxVF = MaxVectorElementCount;
4197   if (MaximizeBandwidth ||
4198       (MaximizeBandwidth.getNumOccurrences() == 0 &&
4199        (TTI.shouldMaximizeVectorBandwidth(RegKind) ||
4200         (UseWiderVFIfCallVariantsPresent && Legal->hasVectorCallVariants())))) {
4201     auto MaxVectorElementCountMaxBW = ElementCount::get(
4202         llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType),
4203         ComputeScalableMaxVF);
4204     MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
4205 
4206     // Collect all viable vectorization factors larger than the default MaxVF
4207     // (i.e. MaxVectorElementCount).
4208     SmallVector<ElementCount, 8> VFs;
4209     for (ElementCount VS = MaxVectorElementCount * 2;
4210          ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
4211       VFs.push_back(VS);
4212 
4213     // For each VF calculate its register usage.
4214     auto RUs = calculateRegisterUsage(VFs);
4215 
4216     // Select the largest VF which doesn't require more registers than existing
4217     // ones.
4218     for (int I = RUs.size() - 1; I >= 0; --I) {
4219       const auto &MLU = RUs[I].MaxLocalUsers;
4220       if (all_of(MLU, [&](decltype(MLU.front()) &LU) {
4221             return LU.second <= TTI.getNumberOfRegisters(LU.first);
4222           })) {
4223         MaxVF = VFs[I];
4224         break;
4225       }
4226     }
4227     if (ElementCount MinVF =
4228             TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
4229       if (ElementCount::isKnownLT(MaxVF, MinVF)) {
4230         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
4231                           << ") with target's minimum: " << MinVF << '\n');
4232         MaxVF = MinVF;
4233       }
4234     }
4235 
4236     // Invalidate any widening decisions we might have made, in case the loop
4237     // requires prediction (decided later), but we have already made some
4238     // load/store widening decisions.
4239     invalidateCostModelingDecisions();
4240   }
4241   return MaxVF;
4242 }
4243 
4244 /// Convenience function that returns the value of vscale_range iff
4245 /// vscale_range.min == vscale_range.max or otherwise returns the value
4246 /// returned by the corresponding TTI method.
4247 static std::optional<unsigned>
4248 getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI) {
4249   const Function *Fn = L->getHeader()->getParent();
4250   if (Fn->hasFnAttribute(Attribute::VScaleRange)) {
4251     auto Attr = Fn->getFnAttribute(Attribute::VScaleRange);
4252     auto Min = Attr.getVScaleRangeMin();
4253     auto Max = Attr.getVScaleRangeMax();
4254     if (Max && Min == Max)
4255       return Max;
4256   }
4257 
4258   return TTI.getVScaleForTuning();
4259 }
4260 
4261 /// This function attempts to return a value that represents the vectorization
4262 /// factor at runtime. For fixed-width VFs we know this precisely at compile
4263 /// time, but for scalable VFs we calculate it based on an estimate of the
4264 /// vscale value.
4265 static unsigned getEstimatedRuntimeVF(const Loop *L,
4266                                       const TargetTransformInfo &TTI,
4267                                       ElementCount VF) {
4268   unsigned EstimatedVF = VF.getKnownMinValue();
4269   if (VF.isScalable())
4270     if (std::optional<unsigned> VScale = getVScaleForTuning(L, TTI))
4271       EstimatedVF *= *VScale;
4272   assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
4273   return EstimatedVF;
4274 }
4275 
4276 bool LoopVectorizationPlanner::isMoreProfitable(
4277     const VectorizationFactor &A, const VectorizationFactor &B,
4278     const unsigned MaxTripCount) const {
4279   InstructionCost CostA = A.Cost;
4280   InstructionCost CostB = B.Cost;
4281 
4282   // Improve estimate for the vector width if it is scalable.
4283   unsigned EstimatedWidthA = A.Width.getKnownMinValue();
4284   unsigned EstimatedWidthB = B.Width.getKnownMinValue();
4285   if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI)) {
4286     if (A.Width.isScalable())
4287       EstimatedWidthA *= *VScale;
4288     if (B.Width.isScalable())
4289       EstimatedWidthB *= *VScale;
4290   }
4291 
4292   // Assume vscale may be larger than 1 (or the value being tuned for),
4293   // so that scalable vectorization is slightly favorable over fixed-width
4294   // vectorization.
4295   bool PreferScalable = !TTI.preferFixedOverScalableIfEqualCost() &&
4296                         A.Width.isScalable() && !B.Width.isScalable();
4297 
4298   auto CmpFn = [PreferScalable](const InstructionCost &LHS,
4299                                 const InstructionCost &RHS) {
4300     return PreferScalable ? LHS <= RHS : LHS < RHS;
4301   };
4302 
4303   // To avoid the need for FP division:
4304   //      (CostA / EstimatedWidthA) < (CostB / EstimatedWidthB)
4305   // <=>  (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA)
4306   if (!MaxTripCount)
4307     return CmpFn(CostA * EstimatedWidthB, CostB * EstimatedWidthA);
4308 
4309   auto GetCostForTC = [MaxTripCount, this](unsigned VF,
4310                                            InstructionCost VectorCost,
4311                                            InstructionCost ScalarCost) {
4312     // If the trip count is a known (possibly small) constant, the trip count
4313     // will be rounded up to an integer number of iterations under
4314     // FoldTailByMasking. The total cost in that case will be
4315     // VecCost*ceil(TripCount/VF). When not folding the tail, the total
4316     // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be
4317     // some extra overheads, but for the purpose of comparing the costs of
4318     // different VFs we can use this to compare the total loop-body cost
4319     // expected after vectorization.
4320     if (CM.foldTailByMasking())
4321       return VectorCost * divideCeil(MaxTripCount, VF);
4322     return VectorCost * (MaxTripCount / VF) + ScalarCost * (MaxTripCount % VF);
4323   };
4324 
4325   auto RTCostA = GetCostForTC(EstimatedWidthA, CostA, A.ScalarCost);
4326   auto RTCostB = GetCostForTC(EstimatedWidthB, CostB, B.ScalarCost);
4327   return CmpFn(RTCostA, RTCostB);
4328 }
4329 
4330 bool LoopVectorizationPlanner::isMoreProfitable(
4331     const VectorizationFactor &A, const VectorizationFactor &B) const {
4332   const unsigned MaxTripCount = PSE.getSmallConstantMaxTripCount();
4333   return LoopVectorizationPlanner::isMoreProfitable(A, B, MaxTripCount);
4334 }
4335 
4336 void LoopVectorizationPlanner::emitInvalidCostRemarks(
4337     OptimizationRemarkEmitter *ORE) {
4338   using RecipeVFPair = std::pair<VPRecipeBase *, ElementCount>;
4339   SmallVector<RecipeVFPair> InvalidCosts;
4340   for (const auto &Plan : VPlans) {
4341     for (ElementCount VF : Plan->vectorFactors()) {
4342       VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(),
4343                             CM, CM.CostKind);
4344       precomputeCosts(*Plan, VF, CostCtx);
4345       auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry());
4346       for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
4347         for (auto &R : *VPBB) {
4348           if (!R.cost(VF, CostCtx).isValid())
4349             InvalidCosts.emplace_back(&R, VF);
4350         }
4351       }
4352     }
4353   }
4354   if (InvalidCosts.empty())
4355     return;
4356 
4357   // Emit a report of VFs with invalid costs in the loop.
4358 
4359   // Group the remarks per recipe, keeping the recipe order from InvalidCosts.
4360   DenseMap<VPRecipeBase *, unsigned> Numbering;
4361   unsigned I = 0;
4362   for (auto &Pair : InvalidCosts)
4363     if (!Numbering.count(Pair.first))
4364       Numbering[Pair.first] = I++;
4365 
4366   // Sort the list, first on recipe(number) then on VF.
4367   sort(InvalidCosts, [&Numbering](RecipeVFPair &A, RecipeVFPair &B) {
4368     if (Numbering[A.first] != Numbering[B.first])
4369       return Numbering[A.first] < Numbering[B.first];
4370     const auto &LHS = A.second;
4371     const auto &RHS = B.second;
4372     return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
4373            std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
4374   });
4375 
4376   // For a list of ordered recipe-VF pairs:
4377   //   [(load, VF1), (load, VF2), (store, VF1)]
4378   // group the recipes together to emit separate remarks for:
4379   //   load  (VF1, VF2)
4380   //   store (VF1)
4381   auto Tail = ArrayRef<RecipeVFPair>(InvalidCosts);
4382   auto Subset = ArrayRef<RecipeVFPair>();
4383   do {
4384     if (Subset.empty())
4385       Subset = Tail.take_front(1);
4386 
4387     VPRecipeBase *R = Subset.front().first;
4388 
4389     unsigned Opcode =
4390         TypeSwitch<const VPRecipeBase *, unsigned>(R)
4391             .Case<VPHeaderPHIRecipe>(
4392                 [](const auto *R) { return Instruction::PHI; })
4393             .Case<VPWidenSelectRecipe>(
4394                 [](const auto *R) { return Instruction::Select; })
4395             .Case<VPWidenStoreRecipe>(
4396                 [](const auto *R) { return Instruction::Store; })
4397             .Case<VPWidenLoadRecipe>(
4398                 [](const auto *R) { return Instruction::Load; })
4399             .Case<VPWidenCallRecipe, VPWidenIntrinsicRecipe>(
4400                 [](const auto *R) { return Instruction::Call; })
4401             .Case<VPInstruction, VPWidenRecipe, VPReplicateRecipe,
4402                   VPWidenCastRecipe>(
4403                 [](const auto *R) { return R->getOpcode(); })
4404             .Case<VPInterleaveRecipe>([](const VPInterleaveRecipe *R) {
4405               return R->getStoredValues().empty() ? Instruction::Load
4406                                                   : Instruction::Store;
4407             });
4408 
4409     // If the next recipe is different, or if there are no other pairs,
4410     // emit a remark for the collated subset. e.g.
4411     //   [(load, VF1), (load, VF2))]
4412     // to emit:
4413     //  remark: invalid costs for 'load' at VF=(VF1, VF2)
4414     if (Subset == Tail || Tail[Subset.size()].first != R) {
4415       std::string OutString;
4416       raw_string_ostream OS(OutString);
4417       assert(!Subset.empty() && "Unexpected empty range");
4418       OS << "Recipe with invalid costs prevented vectorization at VF=(";
4419       for (const auto &Pair : Subset)
4420         OS << (Pair.second == Subset.front().second ? "" : ", ") << Pair.second;
4421       OS << "):";
4422       if (Opcode == Instruction::Call) {
4423         StringRef Name = "";
4424         if (auto *Int = dyn_cast<VPWidenIntrinsicRecipe>(R)) {
4425           Name = Int->getIntrinsicName();
4426         } else {
4427           auto *WidenCall = dyn_cast<VPWidenCallRecipe>(R);
4428           Function *CalledFn =
4429               WidenCall ? WidenCall->getCalledScalarFunction()
4430                         : cast<Function>(R->getOperand(R->getNumOperands() - 1)
4431                                              ->getLiveInIRValue());
4432           Name = CalledFn->getName();
4433         }
4434         OS << " call to " << Name;
4435       } else
4436         OS << " " << Instruction::getOpcodeName(Opcode);
4437       reportVectorizationInfo(OutString, "InvalidCost", ORE, OrigLoop, nullptr,
4438                               R->getDebugLoc());
4439       Tail = Tail.drop_front(Subset.size());
4440       Subset = {};
4441     } else
4442       // Grow the subset by one element
4443       Subset = Tail.take_front(Subset.size() + 1);
4444   } while (!Tail.empty());
4445 }
4446 
4447 /// Check if any recipe of \p Plan will generate a vector value, which will be
4448 /// assigned a vector register.
4449 static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
4450                                 const TargetTransformInfo &TTI) {
4451   assert(VF.isVector() && "Checking a scalar VF?");
4452   VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
4453   DenseSet<VPRecipeBase *> EphemeralRecipes;
4454   collectEphemeralRecipesForVPlan(Plan, EphemeralRecipes);
4455   // Set of already visited types.
4456   DenseSet<Type *> Visited;
4457   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
4458            vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
4459     for (VPRecipeBase &R : *VPBB) {
4460       if (EphemeralRecipes.contains(&R))
4461         continue;
4462       // Continue early if the recipe is considered to not produce a vector
4463       // result. Note that this includes VPInstruction where some opcodes may
4464       // produce a vector, to preserve existing behavior as VPInstructions model
4465       // aspects not directly mapped to existing IR instructions.
4466       switch (R.getVPDefID()) {
4467       case VPDef::VPDerivedIVSC:
4468       case VPDef::VPScalarIVStepsSC:
4469       case VPDef::VPScalarCastSC:
4470       case VPDef::VPReplicateSC:
4471       case VPDef::VPInstructionSC:
4472       case VPDef::VPCanonicalIVPHISC:
4473       case VPDef::VPVectorPointerSC:
4474       case VPDef::VPReverseVectorPointerSC:
4475       case VPDef::VPExpandSCEVSC:
4476       case VPDef::VPEVLBasedIVPHISC:
4477       case VPDef::VPPredInstPHISC:
4478       case VPDef::VPBranchOnMaskSC:
4479         continue;
4480       case VPDef::VPReductionSC:
4481       case VPDef::VPActiveLaneMaskPHISC:
4482       case VPDef::VPWidenCallSC:
4483       case VPDef::VPWidenCanonicalIVSC:
4484       case VPDef::VPWidenCastSC:
4485       case VPDef::VPWidenGEPSC:
4486       case VPDef::VPWidenIntrinsicSC:
4487       case VPDef::VPWidenSC:
4488       case VPDef::VPWidenSelectSC:
4489       case VPDef::VPBlendSC:
4490       case VPDef::VPFirstOrderRecurrencePHISC:
4491       case VPDef::VPWidenPHISC:
4492       case VPDef::VPWidenIntOrFpInductionSC:
4493       case VPDef::VPWidenPointerInductionSC:
4494       case VPDef::VPReductionPHISC:
4495       case VPDef::VPInterleaveSC:
4496       case VPDef::VPWidenLoadEVLSC:
4497       case VPDef::VPWidenLoadSC:
4498       case VPDef::VPWidenStoreEVLSC:
4499       case VPDef::VPWidenStoreSC:
4500         break;
4501       default:
4502         llvm_unreachable("unhandled recipe");
4503       }
4504 
4505       auto WillWiden = [&TTI, VF](Type *ScalarTy) {
4506         Type *VectorTy = toVectorTy(ScalarTy, VF);
4507         unsigned NumLegalParts = TTI.getNumberOfParts(VectorTy);
4508         if (!NumLegalParts)
4509           return false;
4510         if (VF.isScalable()) {
4511           // <vscale x 1 x iN> is assumed to be profitable over iN because
4512           // scalable registers are a distinct register class from scalar
4513           // ones. If we ever find a target which wants to lower scalable
4514           // vectors back to scalars, we'll need to update this code to
4515           // explicitly ask TTI about the register class uses for each part.
4516           return NumLegalParts <= VF.getKnownMinValue();
4517         }
4518         // Two or more parts that share a register - are vectorized.
4519         return NumLegalParts < VF.getKnownMinValue();
4520       };
4521 
4522       // If no def nor is a store, e.g., branches, continue - no value to check.
4523       if (R.getNumDefinedValues() == 0 &&
4524           !isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe, VPInterleaveRecipe>(
4525               &R))
4526         continue;
4527       // For multi-def recipes, currently only interleaved loads, suffice to
4528       // check first def only.
4529       // For stores check their stored value; for interleaved stores suffice
4530       // the check first stored value only. In all cases this is the second
4531       // operand.
4532       VPValue *ToCheck =
4533           R.getNumDefinedValues() >= 1 ? R.getVPValue(0) : R.getOperand(1);
4534       Type *ScalarTy = TypeInfo.inferScalarType(ToCheck);
4535       if (!Visited.insert({ScalarTy}).second)
4536         continue;
4537       if (WillWiden(ScalarTy))
4538         return true;
4539     }
4540   }
4541 
4542   return false;
4543 }
4544 
4545 #ifndef NDEBUG
4546 VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
4547   InstructionCost ExpectedCost = CM.expectedCost(ElementCount::getFixed(1));
4548   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
4549   assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
4550   assert(any_of(VPlans,
4551                 [](std::unique_ptr<VPlan> &P) {
4552                   return P->hasVF(ElementCount::getFixed(1));
4553                 }) &&
4554          "Expected Scalar VF to be a candidate");
4555 
4556   const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost,
4557                                        ExpectedCost);
4558   VectorizationFactor ChosenFactor = ScalarCost;
4559 
4560   bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
4561   if (ForceVectorization &&
4562       (VPlans.size() > 1 || !VPlans[0]->hasScalarVFOnly())) {
4563     // Ignore scalar width, because the user explicitly wants vectorization.
4564     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
4565     // evaluation.
4566     ChosenFactor.Cost = InstructionCost::getMax();
4567   }
4568 
4569   for (auto &P : VPlans) {
4570     for (ElementCount VF : P->vectorFactors()) {
4571       // The cost for scalar VF=1 is already calculated, so ignore it.
4572       if (VF.isScalar())
4573         continue;
4574 
4575       InstructionCost C = CM.expectedCost(VF);
4576       VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost);
4577 
4578       unsigned Width = getEstimatedRuntimeVF(OrigLoop, TTI, Candidate.Width);
4579       LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << VF
4580                         << " costs: " << (Candidate.Cost / Width));
4581       if (VF.isScalable())
4582         LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
4583                           << getVScaleForTuning(OrigLoop, TTI).value_or(1)
4584                           << ")");
4585       LLVM_DEBUG(dbgs() << ".\n");
4586 
4587       if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
4588         LLVM_DEBUG(
4589             dbgs()
4590             << "LV: Not considering vector loop of width " << VF
4591             << " because it will not generate any vector instructions.\n");
4592         continue;
4593       }
4594 
4595       if (isMoreProfitable(Candidate, ChosenFactor))
4596         ChosenFactor = Candidate;
4597     }
4598   }
4599 
4600   if (!EnableCondStoresVectorization && CM.hasPredStores()) {
4601     reportVectorizationFailure(
4602         "There are conditional stores.",
4603         "store that is conditionally executed prevents vectorization",
4604         "ConditionalStore", ORE, OrigLoop);
4605     ChosenFactor = ScalarCost;
4606   }
4607 
4608   LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
4609                  !isMoreProfitable(ChosenFactor, ScalarCost)) dbgs()
4610              << "LV: Vectorization seems to be not beneficial, "
4611              << "but was forced by a user.\n");
4612   return ChosenFactor;
4613 }
4614 #endif
4615 
4616 bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
4617     ElementCount VF) const {
4618   // Cross iteration phis such as reductions need special handling and are
4619   // currently unsupported.
4620   if (any_of(OrigLoop->getHeader()->phis(),
4621              [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(&Phi); }))
4622     return false;
4623 
4624   // Phis with uses outside of the loop require special handling and are
4625   // currently unsupported.
4626   for (const auto &Entry : Legal->getInductionVars()) {
4627     // Look for uses of the value of the induction at the last iteration.
4628     Value *PostInc =
4629         Entry.first->getIncomingValueForBlock(OrigLoop->getLoopLatch());
4630     for (User *U : PostInc->users())
4631       if (!OrigLoop->contains(cast<Instruction>(U)))
4632         return false;
4633     // Look for uses of penultimate value of the induction.
4634     for (User *U : Entry.first->users())
4635       if (!OrigLoop->contains(cast<Instruction>(U)))
4636         return false;
4637   }
4638 
4639   // Epilogue vectorization code has not been auditted to ensure it handles
4640   // non-latch exits properly.  It may be fine, but it needs auditted and
4641   // tested.
4642   // TODO: Add support for loops with an early exit.
4643   if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch())
4644     return false;
4645 
4646   return true;
4647 }
4648 
4649 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
4650     const ElementCount VF, const unsigned IC) const {
4651   // FIXME: We need a much better cost-model to take different parameters such
4652   // as register pressure, code size increase and cost of extra branches into
4653   // account. For now we apply a very crude heuristic and only consider loops
4654   // with vectorization factors larger than a certain value.
4655 
4656   // Allow the target to opt out entirely.
4657   if (!TTI.preferEpilogueVectorization())
4658     return false;
4659 
4660   // We also consider epilogue vectorization unprofitable for targets that don't
4661   // consider interleaving beneficial (eg. MVE).
4662   if (TTI.getMaxInterleaveFactor(VF) <= 1)
4663     return false;
4664 
4665   // TODO: PR #108190 introduced a discrepancy between fixed-width and scalable
4666   // VFs when deciding profitability.
4667   // See related "TODO: extend to support scalable VFs." in
4668   // selectEpilogueVectorizationFactor.
4669   unsigned Multiplier = VF.isFixed() ? IC : 1;
4670   unsigned MinVFThreshold = EpilogueVectorizationMinVF.getNumOccurrences() > 0
4671                                 ? EpilogueVectorizationMinVF
4672                                 : TTI.getEpilogueVectorizationMinVF();
4673   return getEstimatedRuntimeVF(TheLoop, TTI, VF * Multiplier) >= MinVFThreshold;
4674 }
4675 
4676 VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
4677     const ElementCount MainLoopVF, unsigned IC) {
4678   VectorizationFactor Result = VectorizationFactor::Disabled();
4679   if (!EnableEpilogueVectorization) {
4680     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n");
4681     return Result;
4682   }
4683 
4684   if (!CM.isScalarEpilogueAllowed()) {
4685     LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no "
4686                          "epilogue is allowed.\n");
4687     return Result;
4688   }
4689 
4690   // Not really a cost consideration, but check for unsupported cases here to
4691   // simplify the logic.
4692   if (!isCandidateForEpilogueVectorization(MainLoopVF)) {
4693     LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop "
4694                          "is not a supported candidate.\n");
4695     return Result;
4696   }
4697 
4698   if (EpilogueVectorizationForceVF > 1) {
4699     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n");
4700     ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF);
4701     if (hasPlanWithVF(ForcedEC))
4702       return {ForcedEC, 0, 0};
4703 
4704     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not "
4705                          "viable.\n");
4706     return Result;
4707   }
4708 
4709   if (OrigLoop->getHeader()->getParent()->hasOptSize() ||
4710       OrigLoop->getHeader()->getParent()->hasMinSize()) {
4711     LLVM_DEBUG(
4712         dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n");
4713     return Result;
4714   }
4715 
4716   if (!CM.isEpilogueVectorizationProfitable(MainLoopVF, IC)) {
4717     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
4718                          "this loop\n");
4719     return Result;
4720   }
4721 
4722   // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
4723   // the main loop handles 8 lanes per iteration. We could still benefit from
4724   // vectorizing the epilogue loop with VF=4.
4725   ElementCount EstimatedRuntimeVF =
4726       ElementCount::getFixed(getEstimatedRuntimeVF(OrigLoop, TTI, MainLoopVF));
4727 
4728   ScalarEvolution &SE = *PSE.getSE();
4729   Type *TCType = Legal->getWidestInductionType();
4730   const SCEV *RemainingIterations = nullptr;
4731   unsigned MaxTripCount = 0;
4732   for (auto &NextVF : ProfitableVFs) {
4733     // Skip candidate VFs without a corresponding VPlan.
4734     if (!hasPlanWithVF(NextVF.Width))
4735       continue;
4736 
4737     // Skip candidate VFs with widths >= the (estimated) runtime VF (scalable
4738     // vectors) or > the VF of the main loop (fixed vectors).
4739     if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
4740          ElementCount::isKnownGE(NextVF.Width, EstimatedRuntimeVF)) ||
4741         (NextVF.Width.isScalable() &&
4742          ElementCount::isKnownGE(NextVF.Width, MainLoopVF)) ||
4743         (!NextVF.Width.isScalable() && !MainLoopVF.isScalable() &&
4744          ElementCount::isKnownGT(NextVF.Width, MainLoopVF)))
4745       continue;
4746 
4747     // If NextVF is greater than the number of remaining iterations, the
4748     // epilogue loop would be dead. Skip such factors.
4749     if (!MainLoopVF.isScalable() && !NextVF.Width.isScalable()) {
4750       // TODO: extend to support scalable VFs.
4751       if (!RemainingIterations) {
4752         const SCEV *TC = vputils::getSCEVExprForVPValue(
4753             getPlanFor(NextVF.Width).getTripCount(), SE);
4754         assert(!isa<SCEVCouldNotCompute>(TC) &&
4755                "Trip count SCEV must be computable");
4756         RemainingIterations = SE.getURemExpr(
4757             TC, SE.getConstant(TCType, MainLoopVF.getKnownMinValue() * IC));
4758         MaxTripCount = MainLoopVF.getKnownMinValue() * IC - 1;
4759         if (SE.isKnownPredicate(CmpInst::ICMP_ULT, RemainingIterations,
4760                                 SE.getConstant(TCType, MaxTripCount))) {
4761           MaxTripCount =
4762               SE.getUnsignedRangeMax(RemainingIterations).getZExtValue();
4763         }
4764         LLVM_DEBUG(dbgs() << "LEV: Maximum Trip Count for Epilogue: "
4765                           << MaxTripCount << "\n");
4766       }
4767       if (SE.isKnownPredicate(
4768               CmpInst::ICMP_UGT,
4769               SE.getConstant(TCType, NextVF.Width.getKnownMinValue()),
4770               RemainingIterations))
4771         continue;
4772     }
4773 
4774     if (Result.Width.isScalar() ||
4775         isMoreProfitable(NextVF, Result, MaxTripCount))
4776       Result = NextVF;
4777   }
4778 
4779   if (Result != VectorizationFactor::Disabled())
4780     LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
4781                       << Result.Width << "\n");
4782   return Result;
4783 }
4784 
4785 std::pair<unsigned, unsigned>
4786 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
4787   unsigned MinWidth = -1U;
4788   unsigned MaxWidth = 8;
4789   const DataLayout &DL = TheFunction->getDataLayout();
4790   // For in-loop reductions, no element types are added to ElementTypesInLoop
4791   // if there are no loads/stores in the loop. In this case, check through the
4792   // reduction variables to determine the maximum width.
4793   if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
4794     // Reset MaxWidth so that we can find the smallest type used by recurrences
4795     // in the loop.
4796     MaxWidth = -1U;
4797     for (const auto &PhiDescriptorPair : Legal->getReductionVars()) {
4798       const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
4799       // When finding the min width used by the recurrence we need to account
4800       // for casts on the input operands of the recurrence.
4801       MaxWidth = std::min<unsigned>(
4802           MaxWidth, std::min<unsigned>(
4803                         RdxDesc.getMinWidthCastToRecurrenceTypeInBits(),
4804                         RdxDesc.getRecurrenceType()->getScalarSizeInBits()));
4805     }
4806   } else {
4807     for (Type *T : ElementTypesInLoop) {
4808       MinWidth = std::min<unsigned>(
4809           MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
4810       MaxWidth = std::max<unsigned>(
4811           MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
4812     }
4813   }
4814   return {MinWidth, MaxWidth};
4815 }
4816 
4817 void LoopVectorizationCostModel::collectElementTypesForWidening() {
4818   ElementTypesInLoop.clear();
4819   // For each block.
4820   for (BasicBlock *BB : TheLoop->blocks()) {
4821     // For each instruction in the loop.
4822     for (Instruction &I : BB->instructionsWithoutDebug()) {
4823       Type *T = I.getType();
4824 
4825       // Skip ignored values.
4826       if (ValuesToIgnore.count(&I))
4827         continue;
4828 
4829       // Only examine Loads, Stores and PHINodes.
4830       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
4831         continue;
4832 
4833       // Examine PHI nodes that are reduction variables. Update the type to
4834       // account for the recurrence type.
4835       if (auto *PN = dyn_cast<PHINode>(&I)) {
4836         if (!Legal->isReductionVariable(PN))
4837           continue;
4838         const RecurrenceDescriptor &RdxDesc =
4839             Legal->getReductionVars().find(PN)->second;
4840         if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
4841             TTI.preferInLoopReduction(RdxDesc.getOpcode(),
4842                                       RdxDesc.getRecurrenceType(),
4843                                       TargetTransformInfo::ReductionFlags()))
4844           continue;
4845         T = RdxDesc.getRecurrenceType();
4846       }
4847 
4848       // Examine the stored values.
4849       if (auto *ST = dyn_cast<StoreInst>(&I))
4850         T = ST->getValueOperand()->getType();
4851 
4852       assert(T->isSized() &&
4853              "Expected the load/store/recurrence type to be sized");
4854 
4855       ElementTypesInLoop.insert(T);
4856     }
4857   }
4858 }
4859 
4860 unsigned
4861 LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
4862                                                   InstructionCost LoopCost) {
4863   // -- The interleave heuristics --
4864   // We interleave the loop in order to expose ILP and reduce the loop overhead.
4865   // There are many micro-architectural considerations that we can't predict
4866   // at this level. For example, frontend pressure (on decode or fetch) due to
4867   // code size, or the number and capabilities of the execution ports.
4868   //
4869   // We use the following heuristics to select the interleave count:
4870   // 1. If the code has reductions, then we interleave to break the cross
4871   // iteration dependency.
4872   // 2. If the loop is really small, then we interleave to reduce the loop
4873   // overhead.
4874   // 3. We don't interleave if we think that we will spill registers to memory
4875   // due to the increased register pressure.
4876 
4877   if (!isScalarEpilogueAllowed())
4878     return 1;
4879 
4880   // Do not interleave if EVL is preferred and no User IC is specified.
4881   if (foldTailWithEVL()) {
4882     LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. "
4883                          "Unroll factor forced to be 1.\n");
4884     return 1;
4885   }
4886 
4887   // We used the distance for the interleave count.
4888   if (!Legal->isSafeForAnyVectorWidth())
4889     return 1;
4890 
4891   // We don't attempt to perform interleaving for loops with uncountable early
4892   // exits because the VPInstruction::AnyOf code cannot currently handle
4893   // multiple parts.
4894   if (Legal->hasUncountableEarlyExit())
4895     return 1;
4896 
4897   auto BestKnownTC = getSmallBestKnownTC(PSE, TheLoop);
4898   const bool HasReductions = !Legal->getReductionVars().empty();
4899 
4900   // If we did not calculate the cost for VF (because the user selected the VF)
4901   // then we calculate the cost of VF here.
4902   if (LoopCost == 0) {
4903     LoopCost = expectedCost(VF);
4904     assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost");
4905 
4906     // Loop body is free and there is no need for interleaving.
4907     if (LoopCost == 0)
4908       return 1;
4909   }
4910 
4911   RegisterUsage R = calculateRegisterUsage({VF})[0];
4912   // We divide by these constants so assume that we have at least one
4913   // instruction that uses at least one register.
4914   for (auto &Pair : R.MaxLocalUsers) {
4915     Pair.second = std::max(Pair.second, 1U);
4916   }
4917 
4918   // We calculate the interleave count using the following formula.
4919   // Subtract the number of loop invariants from the number of available
4920   // registers. These registers are used by all of the interleaved instances.
4921   // Next, divide the remaining registers by the number of registers that is
4922   // required by the loop, in order to estimate how many parallel instances
4923   // fit without causing spills. All of this is rounded down if necessary to be
4924   // a power of two. We want power of two interleave count to simplify any
4925   // addressing operations or alignment considerations.
4926   // We also want power of two interleave counts to ensure that the induction
4927   // variable of the vector loop wraps to zero, when tail is folded by masking;
4928   // this currently happens when OptForSize, in which case IC is set to 1 above.
4929   unsigned IC = UINT_MAX;
4930 
4931   for (const auto &Pair : R.MaxLocalUsers) {
4932     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(Pair.first);
4933     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
4934                       << " registers of "
4935                       << TTI.getRegisterClassName(Pair.first)
4936                       << " register class\n");
4937     if (VF.isScalar()) {
4938       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
4939         TargetNumRegisters = ForceTargetNumScalarRegs;
4940     } else {
4941       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
4942         TargetNumRegisters = ForceTargetNumVectorRegs;
4943     }
4944     unsigned MaxLocalUsers = Pair.second;
4945     unsigned LoopInvariantRegs = 0;
4946     if (R.LoopInvariantRegs.find(Pair.first) != R.LoopInvariantRegs.end())
4947       LoopInvariantRegs = R.LoopInvariantRegs[Pair.first];
4948 
4949     unsigned TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs) /
4950                                      MaxLocalUsers);
4951     // Don't count the induction variable as interleaved.
4952     if (EnableIndVarRegisterHeur) {
4953       TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs - 1) /
4954                               std::max(1U, (MaxLocalUsers - 1)));
4955     }
4956 
4957     IC = std::min(IC, TmpIC);
4958   }
4959 
4960   // Clamp the interleave ranges to reasonable counts.
4961   unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
4962 
4963   // Check if the user has overridden the max.
4964   if (VF.isScalar()) {
4965     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
4966       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
4967   } else {
4968     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
4969       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
4970   }
4971 
4972   unsigned EstimatedVF = getEstimatedRuntimeVF(TheLoop, TTI, VF);
4973   unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4974   if (KnownTC > 0) {
4975     // At least one iteration must be scalar when this constraint holds. So the
4976     // maximum available iterations for interleaving is one less.
4977     unsigned AvailableTC =
4978         requiresScalarEpilogue(VF.isVector()) ? KnownTC - 1 : KnownTC;
4979 
4980     // If trip count is known we select between two prospective ICs, where
4981     // 1) the aggressive IC is capped by the trip count divided by VF
4982     // 2) the conservative IC is capped by the trip count divided by (VF * 2)
4983     // The final IC is selected in a way that the epilogue loop trip count is
4984     // minimized while maximizing the IC itself, so that we either run the
4985     // vector loop at least once if it generates a small epilogue loop, or else
4986     // we run the vector loop at least twice.
4987 
4988     unsigned InterleaveCountUB = bit_floor(
4989         std::max(1u, std::min(AvailableTC / EstimatedVF, MaxInterleaveCount)));
4990     unsigned InterleaveCountLB = bit_floor(std::max(
4991         1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
4992     MaxInterleaveCount = InterleaveCountLB;
4993 
4994     if (InterleaveCountUB != InterleaveCountLB) {
4995       unsigned TailTripCountUB =
4996           (AvailableTC % (EstimatedVF * InterleaveCountUB));
4997       unsigned TailTripCountLB =
4998           (AvailableTC % (EstimatedVF * InterleaveCountLB));
4999       // If both produce same scalar tail, maximize the IC to do the same work
5000       // in fewer vector loop iterations
5001       if (TailTripCountUB == TailTripCountLB)
5002         MaxInterleaveCount = InterleaveCountUB;
5003     }
5004   } else if (BestKnownTC && *BestKnownTC > 0) {
5005     // At least one iteration must be scalar when this constraint holds. So the
5006     // maximum available iterations for interleaving is one less.
5007     unsigned AvailableTC = requiresScalarEpilogue(VF.isVector())
5008                                ? (*BestKnownTC) - 1
5009                                : *BestKnownTC;
5010 
5011     // If trip count is an estimated compile time constant, limit the
5012     // IC to be capped by the trip count divided by VF * 2, such that the vector
5013     // loop runs at least twice to make interleaving seem profitable when there
5014     // is an epilogue loop present. Since exact Trip count is not known we
5015     // choose to be conservative in our IC estimate.
5016     MaxInterleaveCount = bit_floor(std::max(
5017         1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
5018   }
5019 
5020   assert(MaxInterleaveCount > 0 &&
5021          "Maximum interleave count must be greater than 0");
5022 
5023   // Clamp the calculated IC to be between the 1 and the max interleave count
5024   // that the target and trip count allows.
5025   if (IC > MaxInterleaveCount)
5026     IC = MaxInterleaveCount;
5027   else
5028     // Make sure IC is greater than 0.
5029     IC = std::max(1u, IC);
5030 
5031   assert(IC > 0 && "Interleave count must be greater than 0.");
5032 
5033   // Interleave if we vectorized this loop and there is a reduction that could
5034   // benefit from interleaving.
5035   if (VF.isVector() && HasReductions) {
5036     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5037     return IC;
5038   }
5039 
5040   // For any scalar loop that either requires runtime checks or predication we
5041   // are better off leaving this to the unroller. Note that if we've already
5042   // vectorized the loop we will have done the runtime check and so interleaving
5043   // won't require further checks.
5044   bool ScalarInterleavingRequiresPredication =
5045       (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) {
5046          return Legal->blockNeedsPredication(BB);
5047        }));
5048   bool ScalarInterleavingRequiresRuntimePointerCheck =
5049       (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
5050 
5051   // We want to interleave small loops in order to reduce the loop overhead and
5052   // potentially expose ILP opportunities.
5053   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
5054                     << "LV: IC is " << IC << '\n'
5055                     << "LV: VF is " << VF << '\n');
5056   const bool AggressivelyInterleaveReductions =
5057       TTI.enableAggressiveInterleaving(HasReductions);
5058   if (!ScalarInterleavingRequiresRuntimePointerCheck &&
5059       !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
5060     // We assume that the cost overhead is 1 and we use the cost model
5061     // to estimate the cost of the loop and interleave until the cost of the
5062     // loop overhead is about 5% of the cost of the loop.
5063     unsigned SmallIC = std::min(IC, (unsigned)llvm::bit_floor<uint64_t>(
5064                                         SmallLoopCost / *LoopCost.getValue()));
5065 
5066     // Interleave until store/load ports (estimated by max interleave count) are
5067     // saturated.
5068     unsigned NumStores = Legal->getNumStores();
5069     unsigned NumLoads = Legal->getNumLoads();
5070     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5071     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5072 
5073     // There is little point in interleaving for reductions containing selects
5074     // and compares when VF=1 since it may just create more overhead than it's
5075     // worth for loops with small trip counts. This is because we still have to
5076     // do the final reduction after the loop.
5077     bool HasSelectCmpReductions =
5078         HasReductions &&
5079         any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5080           const RecurrenceDescriptor &RdxDesc = Reduction.second;
5081           RecurKind RK = RdxDesc.getRecurrenceKind();
5082           return RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) ||
5083                  RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK);
5084         });
5085     if (HasSelectCmpReductions) {
5086       LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
5087       return 1;
5088     }
5089 
5090     // If we have a scalar reduction (vector reductions are already dealt with
5091     // by this point), we can increase the critical path length if the loop
5092     // we're interleaving is inside another loop. For tree-wise reductions
5093     // set the limit to 2, and for ordered reductions it's best to disable
5094     // interleaving entirely.
5095     if (HasReductions && TheLoop->getLoopDepth() > 1) {
5096       bool HasOrderedReductions =
5097           any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5098             const RecurrenceDescriptor &RdxDesc = Reduction.second;
5099             return RdxDesc.isOrdered();
5100           });
5101       if (HasOrderedReductions) {
5102         LLVM_DEBUG(
5103             dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
5104         return 1;
5105       }
5106 
5107       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5108       SmallIC = std::min(SmallIC, F);
5109       StoresIC = std::min(StoresIC, F);
5110       LoadsIC = std::min(LoadsIC, F);
5111     }
5112 
5113     if (EnableLoadStoreRuntimeInterleave &&
5114         std::max(StoresIC, LoadsIC) > SmallIC) {
5115       LLVM_DEBUG(
5116           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5117       return std::max(StoresIC, LoadsIC);
5118     }
5119 
5120     // If there are scalar reductions and TTI has enabled aggressive
5121     // interleaving for reductions, we will interleave to expose ILP.
5122     if (VF.isScalar() && AggressivelyInterleaveReductions) {
5123       LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5124       // Interleave no less than SmallIC but not as aggressive as the normal IC
5125       // to satisfy the rare situation when resources are too limited.
5126       return std::max(IC / 2, SmallIC);
5127     }
5128 
5129     LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5130     return SmallIC;
5131   }
5132 
5133   // Interleave if this is a large loop (small loops are already dealt with by
5134   // this point) that could benefit from interleaving.
5135   if (AggressivelyInterleaveReductions) {
5136     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5137     return IC;
5138   }
5139 
5140   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5141   return 1;
5142 }
5143 
5144 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5145 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
5146   // This function calculates the register usage by measuring the highest number
5147   // of values that are alive at a single location. Obviously, this is a very
5148   // rough estimation. We scan the loop in a topological order in order and
5149   // assign a number to each instruction. We use RPO to ensure that defs are
5150   // met before their users. We assume that each instruction that has in-loop
5151   // users starts an interval. We record every time that an in-loop value is
5152   // used, so we have a list of the first and last occurrences of each
5153   // instruction. Next, we transpose this data structure into a multi map that
5154   // holds the list of intervals that *end* at a specific location. This multi
5155   // map allows us to perform a linear search. We scan the instructions linearly
5156   // and record each time that a new interval starts, by placing it in a set.
5157   // If we find this value in the multi-map then we remove it from the set.
5158   // The max register usage is the maximum size of the set.
5159   // We also search for instructions that are defined outside the loop, but are
5160   // used inside the loop. We need this number separately from the max-interval
5161   // usage number because when we unroll, loop-invariant values do not take
5162   // more register.
5163   LoopBlocksDFS DFS(TheLoop);
5164   DFS.perform(LI);
5165 
5166   RegisterUsage RU;
5167 
5168   // Each 'key' in the map opens a new interval. The values
5169   // of the map are the index of the 'last seen' usage of the
5170   // instruction that is the key.
5171   using IntervalMap = SmallDenseMap<Instruction *, unsigned, 16>;
5172 
5173   // Maps instruction to its index.
5174   SmallVector<Instruction *, 64> IdxToInstr;
5175   // Marks the end of each interval.
5176   IntervalMap EndPoint;
5177   // Saves the list of instruction indices that are used in the loop.
5178   SmallPtrSet<Instruction *, 8> Ends;
5179   // Saves the list of values that are used in the loop but are defined outside
5180   // the loop (not including non-instruction values such as arguments and
5181   // constants).
5182   SmallSetVector<Instruction *, 8> LoopInvariants;
5183 
5184   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5185     for (Instruction &I : BB->instructionsWithoutDebug()) {
5186       IdxToInstr.push_back(&I);
5187 
5188       // Save the end location of each USE.
5189       for (Value *U : I.operands()) {
5190         auto *Instr = dyn_cast<Instruction>(U);
5191 
5192         // Ignore non-instruction values such as arguments, constants, etc.
5193         // FIXME: Might need some motivation why these values are ignored. If
5194         // for example an argument is used inside the loop it will increase the
5195         // register pressure (so shouldn't we add it to LoopInvariants).
5196         if (!Instr)
5197           continue;
5198 
5199         // If this instruction is outside the loop then record it and continue.
5200         if (!TheLoop->contains(Instr)) {
5201           LoopInvariants.insert(Instr);
5202           continue;
5203         }
5204 
5205         // Overwrite previous end points.
5206         EndPoint[Instr] = IdxToInstr.size();
5207         Ends.insert(Instr);
5208       }
5209     }
5210   }
5211 
5212   // Saves the list of intervals that end with the index in 'key'.
5213   using InstrList = SmallVector<Instruction *, 2>;
5214   SmallDenseMap<unsigned, InstrList, 16> TransposeEnds;
5215 
5216   // Transpose the EndPoints to a list of values that end at each index.
5217   for (auto &Interval : EndPoint)
5218     TransposeEnds[Interval.second].push_back(Interval.first);
5219 
5220   SmallPtrSet<Instruction *, 8> OpenIntervals;
5221   SmallVector<RegisterUsage, 8> RUs(VFs.size());
5222   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
5223 
5224   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5225 
5226   const auto &TTICapture = TTI;
5227   auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
5228     if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty) ||
5229         (VF.isScalable() &&
5230          !TTICapture.isElementTypeLegalForScalableVector(Ty)))
5231       return 0;
5232     return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
5233   };
5234 
5235   for (unsigned int Idx = 0, Sz = IdxToInstr.size(); Idx < Sz; ++Idx) {
5236     Instruction *I = IdxToInstr[Idx];
5237 
5238     // Remove all of the instructions that end at this location.
5239     InstrList &List = TransposeEnds[Idx];
5240     for (Instruction *ToRemove : List)
5241       OpenIntervals.erase(ToRemove);
5242 
5243     // Ignore instructions that are never used within the loop.
5244     if (!Ends.count(I))
5245       continue;
5246 
5247     // Skip ignored values.
5248     if (ValuesToIgnore.count(I))
5249       continue;
5250 
5251     collectInLoopReductions();
5252 
5253     // For each VF find the maximum usage of registers.
5254     for (unsigned J = 0, E = VFs.size(); J < E; ++J) {
5255       // Count the number of registers used, per register class, given all open
5256       // intervals.
5257       // Note that elements in this SmallMapVector will be default constructed
5258       // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if
5259       // there is no previous entry for ClassID.
5260       SmallMapVector<unsigned, unsigned, 4> RegUsage;
5261 
5262       if (VFs[J].isScalar()) {
5263         for (auto *Inst : OpenIntervals) {
5264           unsigned ClassID =
5265               TTI.getRegisterClassForType(false, Inst->getType());
5266           // FIXME: The target might use more than one register for the type
5267           // even in the scalar case.
5268           RegUsage[ClassID] += 1;
5269         }
5270       } else {
5271         collectUniformsAndScalars(VFs[J]);
5272         for (auto *Inst : OpenIntervals) {
5273           // Skip ignored values for VF > 1.
5274           if (VecValuesToIgnore.count(Inst))
5275             continue;
5276           if (isScalarAfterVectorization(Inst, VFs[J])) {
5277             unsigned ClassID =
5278                 TTI.getRegisterClassForType(false, Inst->getType());
5279             // FIXME: The target might use more than one register for the type
5280             // even in the scalar case.
5281             RegUsage[ClassID] += 1;
5282           } else {
5283             unsigned ClassID =
5284                 TTI.getRegisterClassForType(true, Inst->getType());
5285             RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[J]);
5286           }
5287         }
5288       }
5289 
5290       for (const auto &Pair : RegUsage) {
5291         auto &Entry = MaxUsages[J][Pair.first];
5292         Entry = std::max(Entry, Pair.second);
5293       }
5294     }
5295 
5296     LLVM_DEBUG(dbgs() << "LV(REG): At #" << Idx << " Interval # "
5297                       << OpenIntervals.size() << '\n');
5298 
5299     // Add the current instruction to the list of open intervals.
5300     OpenIntervals.insert(I);
5301   }
5302 
5303   for (unsigned Idx = 0, End = VFs.size(); Idx < End; ++Idx) {
5304     // Note that elements in this SmallMapVector will be default constructed
5305     // as 0. So we can use "Invariant[ClassID] += n" in the code below even if
5306     // there is no previous entry for ClassID.
5307     SmallMapVector<unsigned, unsigned, 4> Invariant;
5308 
5309     for (auto *Inst : LoopInvariants) {
5310       // FIXME: The target might use more than one register for the type
5311       // even in the scalar case.
5312       bool IsScalar = all_of(Inst->users(), [&](User *U) {
5313         auto *I = cast<Instruction>(U);
5314         return TheLoop != LI->getLoopFor(I->getParent()) ||
5315                isScalarAfterVectorization(I, VFs[Idx]);
5316       });
5317 
5318       ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[Idx];
5319       unsigned ClassID =
5320           TTI.getRegisterClassForType(VF.isVector(), Inst->getType());
5321       Invariant[ClassID] += GetRegUsage(Inst->getType(), VF);
5322     }
5323 
5324     LLVM_DEBUG({
5325       dbgs() << "LV(REG): VF = " << VFs[Idx] << '\n';
5326       dbgs() << "LV(REG): Found max usage: " << MaxUsages[Idx].size()
5327              << " item\n";
5328       for (const auto &pair : MaxUsages[Idx]) {
5329         dbgs() << "LV(REG): RegisterClass: "
5330                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5331                << " registers\n";
5332       }
5333       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
5334              << " item\n";
5335       for (const auto &pair : Invariant) {
5336         dbgs() << "LV(REG): RegisterClass: "
5337                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5338                << " registers\n";
5339       }
5340     });
5341 
5342     RU.LoopInvariantRegs = Invariant;
5343     RU.MaxLocalUsers = MaxUsages[Idx];
5344     RUs[Idx] = RU;
5345   }
5346 
5347   return RUs;
5348 }
5349 
5350 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
5351                                                            ElementCount VF) {
5352   // TODO: Cost model for emulated masked load/store is completely
5353   // broken. This hack guides the cost model to use an artificially
5354   // high enough value to practically disable vectorization with such
5355   // operations, except where previously deployed legality hack allowed
5356   // using very low cost values. This is to avoid regressions coming simply
5357   // from moving "masked load/store" check from legality to cost model.
5358   // Masked Load/Gather emulation was previously never allowed.
5359   // Limited number of Masked Store/Scatter emulation was allowed.
5360   assert((isPredicatedInst(I)) &&
5361          "Expecting a scalar emulated instruction");
5362   return isa<LoadInst>(I) ||
5363          (isa<StoreInst>(I) &&
5364           NumPredStores > NumberOfStoresToPredicate);
5365 }
5366 
5367 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
5368   // If we aren't vectorizing the loop, or if we've already collected the
5369   // instructions to scalarize, there's nothing to do. Collection may already
5370   // have occurred if we have a user-selected VF and are now computing the
5371   // expected cost for interleaving.
5372   if (VF.isScalar() || VF.isZero() || InstsToScalarize.contains(VF))
5373     return;
5374 
5375   // Initialize a mapping for VF in InstsToScalalarize. If we find that it's
5376   // not profitable to scalarize any instructions, the presence of VF in the
5377   // map will indicate that we've analyzed it already.
5378   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5379 
5380   PredicatedBBsAfterVectorization[VF].clear();
5381 
5382   // Find all the instructions that are scalar with predication in the loop and
5383   // determine if it would be better to not if-convert the blocks they are in.
5384   // If so, we also record the instructions to scalarize.
5385   for (BasicBlock *BB : TheLoop->blocks()) {
5386     if (!blockNeedsPredicationForAnyReason(BB))
5387       continue;
5388     for (Instruction &I : *BB)
5389       if (isScalarWithPredication(&I, VF)) {
5390         ScalarCostsTy ScalarCosts;
5391         // Do not apply discount logic for:
5392         // 1. Scalars after vectorization, as there will only be a single copy
5393         // of the instruction.
5394         // 2. Scalable VF, as that would lead to invalid scalarization costs.
5395         // 3. Emulated masked memrefs, if a hacked cost is needed.
5396         if (!isScalarAfterVectorization(&I, VF) && !VF.isScalable() &&
5397             !useEmulatedMaskMemRefHack(&I, VF) &&
5398             computePredInstDiscount(&I, ScalarCosts, VF) >= 0) {
5399           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5400           // Check if we decided to scalarize a call. If so, update the widening
5401           // decision of the call to CM_Scalarize with the computed scalar cost.
5402           for (const auto &[I, _] : ScalarCosts) {
5403             auto *CI = dyn_cast<CallInst>(I);
5404             if (!CI || !CallWideningDecisions.contains({CI, VF}))
5405               continue;
5406             CallWideningDecisions[{CI, VF}].Kind = CM_Scalarize;
5407             CallWideningDecisions[{CI, VF}].Cost = ScalarCosts[CI];
5408           }
5409         }
5410         // Remember that BB will remain after vectorization.
5411         PredicatedBBsAfterVectorization[VF].insert(BB);
5412         for (auto *Pred : predecessors(BB)) {
5413           if (Pred->getSingleSuccessor() == BB)
5414             PredicatedBBsAfterVectorization[VF].insert(Pred);
5415         }
5416       }
5417   }
5418 }
5419 
5420 InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
5421     Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
5422   assert(!isUniformAfterVectorization(PredInst, VF) &&
5423          "Instruction marked uniform-after-vectorization will be predicated");
5424 
5425   // Initialize the discount to zero, meaning that the scalar version and the
5426   // vector version cost the same.
5427   InstructionCost Discount = 0;
5428 
5429   // Holds instructions to analyze. The instructions we visit are mapped in
5430   // ScalarCosts. Those instructions are the ones that would be scalarized if
5431   // we find that the scalar version costs less.
5432   SmallVector<Instruction *, 8> Worklist;
5433 
5434   // Returns true if the given instruction can be scalarized.
5435   auto CanBeScalarized = [&](Instruction *I) -> bool {
5436     // We only attempt to scalarize instructions forming a single-use chain
5437     // from the original predicated block that would otherwise be vectorized.
5438     // Although not strictly necessary, we give up on instructions we know will
5439     // already be scalar to avoid traversing chains that are unlikely to be
5440     // beneficial.
5441     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5442         isScalarAfterVectorization(I, VF))
5443       return false;
5444 
5445     // If the instruction is scalar with predication, it will be analyzed
5446     // separately. We ignore it within the context of PredInst.
5447     if (isScalarWithPredication(I, VF))
5448       return false;
5449 
5450     // If any of the instruction's operands are uniform after vectorization,
5451     // the instruction cannot be scalarized. This prevents, for example, a
5452     // masked load from being scalarized.
5453     //
5454     // We assume we will only emit a value for lane zero of an instruction
5455     // marked uniform after vectorization, rather than VF identical values.
5456     // Thus, if we scalarize an instruction that uses a uniform, we would
5457     // create uses of values corresponding to the lanes we aren't emitting code
5458     // for. This behavior can be changed by allowing getScalarValue to clone
5459     // the lane zero values for uniforms rather than asserting.
5460     for (Use &U : I->operands())
5461       if (auto *J = dyn_cast<Instruction>(U.get()))
5462         if (isUniformAfterVectorization(J, VF))
5463           return false;
5464 
5465     // Otherwise, we can scalarize the instruction.
5466     return true;
5467   };
5468 
5469   // Compute the expected cost discount from scalarizing the entire expression
5470   // feeding the predicated instruction. We currently only consider expressions
5471   // that are single-use instruction chains.
5472   Worklist.push_back(PredInst);
5473   while (!Worklist.empty()) {
5474     Instruction *I = Worklist.pop_back_val();
5475 
5476     // If we've already analyzed the instruction, there's nothing to do.
5477     if (ScalarCosts.contains(I))
5478       continue;
5479 
5480     // Compute the cost of the vector instruction. Note that this cost already
5481     // includes the scalarization overhead of the predicated instruction.
5482     InstructionCost VectorCost = getInstructionCost(I, VF);
5483 
5484     // Compute the cost of the scalarized instruction. This cost is the cost of
5485     // the instruction as if it wasn't if-converted and instead remained in the
5486     // predicated block. We will scale this cost by block probability after
5487     // computing the scalarization overhead.
5488     InstructionCost ScalarCost =
5489         VF.getFixedValue() * getInstructionCost(I, ElementCount::getFixed(1));
5490 
5491     // Compute the scalarization overhead of needed insertelement instructions
5492     // and phi nodes.
5493     if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
5494       ScalarCost += TTI.getScalarizationOverhead(
5495           cast<VectorType>(toVectorTy(I->getType(), VF)),
5496           APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ true,
5497           /*Extract*/ false, CostKind);
5498       ScalarCost +=
5499           VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind);
5500     }
5501 
5502     // Compute the scalarization overhead of needed extractelement
5503     // instructions. For each of the instruction's operands, if the operand can
5504     // be scalarized, add it to the worklist; otherwise, account for the
5505     // overhead.
5506     for (Use &U : I->operands())
5507       if (auto *J = dyn_cast<Instruction>(U.get())) {
5508         assert(VectorType::isValidElementType(J->getType()) &&
5509                "Instruction has non-scalar type");
5510         if (CanBeScalarized(J))
5511           Worklist.push_back(J);
5512         else if (needsExtract(J, VF)) {
5513           ScalarCost += TTI.getScalarizationOverhead(
5514               cast<VectorType>(toVectorTy(J->getType(), VF)),
5515               APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false,
5516               /*Extract*/ true, CostKind);
5517         }
5518       }
5519 
5520     // Scale the total scalar cost by block probability.
5521     ScalarCost /= getReciprocalPredBlockProb();
5522 
5523     // Compute the discount. A non-negative discount means the vector version
5524     // of the instruction costs more, and scalarizing would be beneficial.
5525     Discount += VectorCost - ScalarCost;
5526     ScalarCosts[I] = ScalarCost;
5527   }
5528 
5529   return Discount;
5530 }
5531 
5532 InstructionCost LoopVectorizationCostModel::expectedCost(ElementCount VF) {
5533   InstructionCost Cost;
5534 
5535   // If the vector loop gets executed exactly once with the given VF, ignore the
5536   // costs of comparison and induction instructions, as they'll get simplified
5537   // away.
5538   SmallPtrSet<Instruction *, 2> ValuesToIgnoreForVF;
5539   auto TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5540   if (VF.isFixed() && TC == VF.getFixedValue() && !foldTailByMasking())
5541     addFullyUnrolledInstructionsToIgnore(TheLoop, Legal->getInductionVars(),
5542                                          ValuesToIgnoreForVF);
5543 
5544   // For each block.
5545   for (BasicBlock *BB : TheLoop->blocks()) {
5546     InstructionCost BlockCost;
5547 
5548     // For each instruction in the old loop.
5549     for (Instruction &I : BB->instructionsWithoutDebug()) {
5550       // Skip ignored values.
5551       if (ValuesToIgnore.count(&I) || ValuesToIgnoreForVF.count(&I) ||
5552           (VF.isVector() && VecValuesToIgnore.count(&I)))
5553         continue;
5554 
5555       InstructionCost C = getInstructionCost(&I, VF);
5556 
5557       // Check if we should override the cost.
5558       if (C.isValid() && ForceTargetInstructionCost.getNumOccurrences() > 0)
5559         C = InstructionCost(ForceTargetInstructionCost);
5560 
5561       BlockCost += C;
5562       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF "
5563                         << VF << " For instruction: " << I << '\n');
5564     }
5565 
5566     // If we are vectorizing a predicated block, it will have been
5567     // if-converted. This means that the block's instructions (aside from
5568     // stores and instructions that may divide by zero) will now be
5569     // unconditionally executed. For the scalar case, we may not always execute
5570     // the predicated block, if it is an if-else block. Thus, scale the block's
5571     // cost by the probability of executing it. blockNeedsPredication from
5572     // Legal is used so as to not include all blocks in tail folded loops.
5573     if (VF.isScalar() && Legal->blockNeedsPredication(BB))
5574       BlockCost /= getReciprocalPredBlockProb();
5575 
5576     Cost += BlockCost;
5577   }
5578 
5579   return Cost;
5580 }
5581 
5582 /// Gets Address Access SCEV after verifying that the access pattern
5583 /// is loop invariant except the induction variable dependence.
5584 ///
5585 /// This SCEV can be sent to the Target in order to estimate the address
5586 /// calculation cost.
5587 static const SCEV *getAddressAccessSCEV(
5588               Value *Ptr,
5589               LoopVectorizationLegality *Legal,
5590               PredicatedScalarEvolution &PSE,
5591               const Loop *TheLoop) {
5592 
5593   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
5594   if (!Gep)
5595     return nullptr;
5596 
5597   // We are looking for a gep with all loop invariant indices except for one
5598   // which should be an induction variable.
5599   auto *SE = PSE.getSE();
5600   unsigned NumOperands = Gep->getNumOperands();
5601   for (unsigned Idx = 1; Idx < NumOperands; ++Idx) {
5602     Value *Opd = Gep->getOperand(Idx);
5603     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
5604         !Legal->isInductionVariable(Opd))
5605       return nullptr;
5606   }
5607 
5608   // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV.
5609   return PSE.getSCEV(Ptr);
5610 }
5611 
5612 InstructionCost
5613 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
5614                                                         ElementCount VF) {
5615   assert(VF.isVector() &&
5616          "Scalarization cost of instruction implies vectorization.");
5617   if (VF.isScalable())
5618     return InstructionCost::getInvalid();
5619 
5620   Type *ValTy = getLoadStoreType(I);
5621   auto *SE = PSE.getSE();
5622 
5623   unsigned AS = getLoadStoreAddressSpace(I);
5624   Value *Ptr = getLoadStorePointerOperand(I);
5625   Type *PtrTy = toVectorTy(Ptr->getType(), VF);
5626   // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
5627   //       that it is being called from this specific place.
5628 
5629   // Figure out whether the access is strided and get the stride value
5630   // if it's known in compile time
5631   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
5632 
5633   // Get the cost of the scalar memory instruction and address computation.
5634   InstructionCost Cost =
5635       VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
5636 
5637   // Don't pass *I here, since it is scalar but will actually be part of a
5638   // vectorized loop where the user of it is a vectorized instruction.
5639   const Align Alignment = getLoadStoreAlignment(I);
5640   Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(),
5641                                                       ValTy->getScalarType(),
5642                                                       Alignment, AS, CostKind);
5643 
5644   // Get the overhead of the extractelement and insertelement instructions
5645   // we might create due to scalarization.
5646   Cost += getScalarizationOverhead(I, VF);
5647 
5648   // If we have a predicated load/store, it will need extra i1 extracts and
5649   // conditional branches, but may not be executed for each vector lane. Scale
5650   // the cost by the probability of executing the predicated block.
5651   if (isPredicatedInst(I)) {
5652     Cost /= getReciprocalPredBlockProb();
5653 
5654     // Add the cost of an i1 extract and a branch
5655     auto *VecI1Ty =
5656         VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
5657     Cost += TTI.getScalarizationOverhead(
5658         VecI1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
5659         /*Insert=*/false, /*Extract=*/true, CostKind);
5660     Cost += TTI.getCFInstrCost(Instruction::Br, CostKind);
5661 
5662     if (useEmulatedMaskMemRefHack(I, VF))
5663       // Artificially setting to a high enough value to practically disable
5664       // vectorization with such operations.
5665       Cost = 3000000;
5666   }
5667 
5668   return Cost;
5669 }
5670 
5671 InstructionCost
5672 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
5673                                                     ElementCount VF) {
5674   Type *ValTy = getLoadStoreType(I);
5675   auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5676   Value *Ptr = getLoadStorePointerOperand(I);
5677   unsigned AS = getLoadStoreAddressSpace(I);
5678   int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
5679 
5680   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5681          "Stride should be 1 or -1 for consecutive memory access");
5682   const Align Alignment = getLoadStoreAlignment(I);
5683   InstructionCost Cost = 0;
5684   if (Legal->isMaskRequired(I)) {
5685     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
5686                                       CostKind);
5687   } else {
5688     TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
5689     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
5690                                 CostKind, OpInfo, I);
5691   }
5692 
5693   bool Reverse = ConsecutiveStride < 0;
5694   if (Reverse)
5695     Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, {},
5696                                CostKind, 0);
5697   return Cost;
5698 }
5699 
5700 InstructionCost
5701 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
5702                                                 ElementCount VF) {
5703   assert(Legal->isUniformMemOp(*I, VF));
5704 
5705   Type *ValTy = getLoadStoreType(I);
5706   auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5707   const Align Alignment = getLoadStoreAlignment(I);
5708   unsigned AS = getLoadStoreAddressSpace(I);
5709   if (isa<LoadInst>(I)) {
5710     return TTI.getAddressComputationCost(ValTy) +
5711            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
5712                                CostKind) +
5713            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy, {},
5714                               CostKind);
5715   }
5716   StoreInst *SI = cast<StoreInst>(I);
5717 
5718   bool IsLoopInvariantStoreValue = Legal->isInvariant(SI->getValueOperand());
5719   return TTI.getAddressComputationCost(ValTy) +
5720          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
5721                              CostKind) +
5722          (IsLoopInvariantStoreValue
5723               ? 0
5724               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
5725                                        CostKind, VF.getKnownMinValue() - 1));
5726 }
5727 
5728 InstructionCost
5729 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
5730                                                  ElementCount VF) {
5731   Type *ValTy = getLoadStoreType(I);
5732   auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5733   const Align Alignment = getLoadStoreAlignment(I);
5734   const Value *Ptr = getLoadStorePointerOperand(I);
5735 
5736   return TTI.getAddressComputationCost(VectorTy) +
5737          TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
5738                                     Legal->isMaskRequired(I), Alignment,
5739                                     CostKind, I);
5740 }
5741 
5742 InstructionCost
5743 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
5744                                                    ElementCount VF) {
5745   const auto *Group = getInterleavedAccessGroup(I);
5746   assert(Group && "Fail to get an interleaved access group.");
5747 
5748   Instruction *InsertPos = Group->getInsertPos();
5749   Type *ValTy = getLoadStoreType(InsertPos);
5750   auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5751   unsigned AS = getLoadStoreAddressSpace(InsertPos);
5752 
5753   unsigned InterleaveFactor = Group->getFactor();
5754   auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
5755 
5756   // Holds the indices of existing members in the interleaved group.
5757   SmallVector<unsigned, 4> Indices;
5758   for (unsigned IF = 0; IF < InterleaveFactor; IF++)
5759     if (Group->getMember(IF))
5760       Indices.push_back(IF);
5761 
5762   // Calculate the cost of the whole interleaved group.
5763   bool UseMaskForGaps =
5764       (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
5765       (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
5766   InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
5767       InsertPos->getOpcode(), WideVecTy, Group->getFactor(), Indices,
5768       Group->getAlign(), AS, CostKind, Legal->isMaskRequired(I),
5769       UseMaskForGaps);
5770 
5771   if (Group->isReverse()) {
5772     // TODO: Add support for reversed masked interleaved access.
5773     assert(!Legal->isMaskRequired(I) &&
5774            "Reverse masked interleaved access not supported.");
5775     Cost += Group->getNumMembers() *
5776             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, {},
5777                                CostKind, 0);
5778   }
5779   return Cost;
5780 }
5781 
5782 std::optional<InstructionCost>
5783 LoopVectorizationCostModel::getReductionPatternCost(Instruction *I,
5784                                                     ElementCount VF,
5785                                                     Type *Ty) const {
5786   using namespace llvm::PatternMatch;
5787   // Early exit for no inloop reductions
5788   if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Ty))
5789     return std::nullopt;
5790   auto *VectorTy = cast<VectorType>(Ty);
5791 
5792   // We are looking for a pattern of, and finding the minimal acceptable cost:
5793   //  reduce(mul(ext(A), ext(B))) or
5794   //  reduce(mul(A, B)) or
5795   //  reduce(ext(A)) or
5796   //  reduce(A).
5797   // The basic idea is that we walk down the tree to do that, finding the root
5798   // reduction instruction in InLoopReductionImmediateChains. From there we find
5799   // the pattern of mul/ext and test the cost of the entire pattern vs the cost
5800   // of the components. If the reduction cost is lower then we return it for the
5801   // reduction instruction and 0 for the other instructions in the pattern. If
5802   // it is not we return an invalid cost specifying the orignal cost method
5803   // should be used.
5804   Instruction *RetI = I;
5805   if (match(RetI, m_ZExtOrSExt(m_Value()))) {
5806     if (!RetI->hasOneUser())
5807       return std::nullopt;
5808     RetI = RetI->user_back();
5809   }
5810 
5811   if (match(RetI, m_OneUse(m_Mul(m_Value(), m_Value()))) &&
5812       RetI->user_back()->getOpcode() == Instruction::Add) {
5813     RetI = RetI->user_back();
5814   }
5815 
5816   // Test if the found instruction is a reduction, and if not return an invalid
5817   // cost specifying the parent to use the original cost modelling.
5818   if (!InLoopReductionImmediateChains.count(RetI))
5819     return std::nullopt;
5820 
5821   // Find the reduction this chain is a part of and calculate the basic cost of
5822   // the reduction on its own.
5823   Instruction *LastChain = InLoopReductionImmediateChains.at(RetI);
5824   Instruction *ReductionPhi = LastChain;
5825   while (!isa<PHINode>(ReductionPhi))
5826     ReductionPhi = InLoopReductionImmediateChains.at(ReductionPhi);
5827 
5828   const RecurrenceDescriptor &RdxDesc =
5829       Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second;
5830 
5831   InstructionCost BaseCost;
5832   RecurKind RK = RdxDesc.getRecurrenceKind();
5833   if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) {
5834     Intrinsic::ID MinMaxID = getMinMaxReductionIntrinsicOp(RK);
5835     BaseCost = TTI.getMinMaxReductionCost(MinMaxID, VectorTy,
5836                                           RdxDesc.getFastMathFlags(), CostKind);
5837   } else {
5838     BaseCost = TTI.getArithmeticReductionCost(
5839         RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
5840   }
5841 
5842   // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
5843   // normal fmul instruction to the cost of the fadd reduction.
5844   if (RK == RecurKind::FMulAdd)
5845     BaseCost +=
5846         TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);
5847 
5848   // If we're using ordered reductions then we can just return the base cost
5849   // here, since getArithmeticReductionCost calculates the full ordered
5850   // reduction cost when FP reassociation is not allowed.
5851   if (useOrderedReductions(RdxDesc))
5852     return BaseCost;
5853 
5854   // Get the operand that was not the reduction chain and match it to one of the
5855   // patterns, returning the better cost if it is found.
5856   Instruction *RedOp = RetI->getOperand(1) == LastChain
5857                            ? dyn_cast<Instruction>(RetI->getOperand(0))
5858                            : dyn_cast<Instruction>(RetI->getOperand(1));
5859 
5860   VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
5861 
5862   Instruction *Op0, *Op1;
5863   if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
5864       match(RedOp,
5865             m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
5866       match(Op0, m_ZExtOrSExt(m_Value())) &&
5867       Op0->getOpcode() == Op1->getOpcode() &&
5868       Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
5869       !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) &&
5870       (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
5871 
5872     // Matched reduce.add(ext(mul(ext(A), ext(B)))
5873     // Note that the extend opcodes need to all match, or if A==B they will have
5874     // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
5875     // which is equally fine.
5876     bool IsUnsigned = isa<ZExtInst>(Op0);
5877     auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
5878     auto *MulType = VectorType::get(Op0->getType(), VectorTy);
5879 
5880     InstructionCost ExtCost =
5881         TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
5882                              TTI::CastContextHint::None, CostKind, Op0);
5883     InstructionCost MulCost =
5884         TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
5885     InstructionCost Ext2Cost =
5886         TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType,
5887                              TTI::CastContextHint::None, CostKind, RedOp);
5888 
5889     InstructionCost RedCost = TTI.getMulAccReductionCost(
5890         IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
5891 
5892     if (RedCost.isValid() &&
5893         RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
5894       return I == RetI ? RedCost : 0;
5895   } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
5896              !TheLoop->isLoopInvariant(RedOp)) {
5897     // Matched reduce(ext(A))
5898     bool IsUnsigned = isa<ZExtInst>(RedOp);
5899     auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
5900     InstructionCost RedCost = TTI.getExtendedReductionCost(
5901         RdxDesc.getOpcode(), IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
5902         RdxDesc.getFastMathFlags(), CostKind);
5903 
5904     InstructionCost ExtCost =
5905         TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
5906                              TTI::CastContextHint::None, CostKind, RedOp);
5907     if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
5908       return I == RetI ? RedCost : 0;
5909   } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
5910              match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
5911     if (match(Op0, m_ZExtOrSExt(m_Value())) &&
5912         Op0->getOpcode() == Op1->getOpcode() &&
5913         !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) {
5914       bool IsUnsigned = isa<ZExtInst>(Op0);
5915       Type *Op0Ty = Op0->getOperand(0)->getType();
5916       Type *Op1Ty = Op1->getOperand(0)->getType();
5917       Type *LargestOpTy =
5918           Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
5919                                                                     : Op0Ty;
5920       auto *ExtType = VectorType::get(LargestOpTy, VectorTy);
5921 
5922       // Matched reduce.add(mul(ext(A), ext(B))), where the two ext may be of
5923       // different sizes. We take the largest type as the ext to reduce, and add
5924       // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))).
5925       InstructionCost ExtCost0 = TTI.getCastInstrCost(
5926           Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy),
5927           TTI::CastContextHint::None, CostKind, Op0);
5928       InstructionCost ExtCost1 = TTI.getCastInstrCost(
5929           Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy),
5930           TTI::CastContextHint::None, CostKind, Op1);
5931       InstructionCost MulCost =
5932           TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
5933 
5934       InstructionCost RedCost = TTI.getMulAccReductionCost(
5935           IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
5936       InstructionCost ExtraExtCost = 0;
5937       if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
5938         Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
5939         ExtraExtCost = TTI.getCastInstrCost(
5940             ExtraExtOp->getOpcode(), ExtType,
5941             VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy),
5942             TTI::CastContextHint::None, CostKind, ExtraExtOp);
5943       }
5944 
5945       if (RedCost.isValid() &&
5946           (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
5947         return I == RetI ? RedCost : 0;
5948     } else if (!match(I, m_ZExtOrSExt(m_Value()))) {
5949       // Matched reduce.add(mul())
5950       InstructionCost MulCost =
5951           TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
5952 
5953       InstructionCost RedCost = TTI.getMulAccReductionCost(
5954           true, RdxDesc.getRecurrenceType(), VectorTy, CostKind);
5955 
5956       if (RedCost.isValid() && RedCost < MulCost + BaseCost)
5957         return I == RetI ? RedCost : 0;
5958     }
5959   }
5960 
5961   return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt;
5962 }
5963 
5964 InstructionCost
5965 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
5966                                                      ElementCount VF) {
5967   // Calculate scalar cost only. Vectorization cost should be ready at this
5968   // moment.
5969   if (VF.isScalar()) {
5970     Type *ValTy = getLoadStoreType(I);
5971     const Align Alignment = getLoadStoreAlignment(I);
5972     unsigned AS = getLoadStoreAddressSpace(I);
5973 
5974     TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
5975     return TTI.getAddressComputationCost(ValTy) +
5976            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, CostKind,
5977                                OpInfo, I);
5978   }
5979   return getWideningCost(I, VF);
5980 }
5981 
5982 InstructionCost
5983 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
5984                                                      ElementCount VF) const {
5985 
5986   // There is no mechanism yet to create a scalable scalarization loop,
5987   // so this is currently Invalid.
5988   if (VF.isScalable())
5989     return InstructionCost::getInvalid();
5990 
5991   if (VF.isScalar())
5992     return 0;
5993 
5994   InstructionCost Cost = 0;
5995   Type *RetTy = toVectorTy(I->getType(), VF);
5996   if (!RetTy->isVoidTy() &&
5997       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
5998     Cost += TTI.getScalarizationOverhead(
5999         cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()),
6000         /*Insert*/ true,
6001         /*Extract*/ false, CostKind);
6002 
6003   // Some targets keep addresses scalar.
6004   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
6005     return Cost;
6006 
6007   // Some targets support efficient element stores.
6008   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
6009     return Cost;
6010 
6011   // Collect operands to consider.
6012   CallInst *CI = dyn_cast<CallInst>(I);
6013   Instruction::op_range Ops = CI ? CI->args() : I->operands();
6014 
6015   // Skip operands that do not require extraction/scalarization and do not incur
6016   // any overhead.
6017   SmallVector<Type *> Tys;
6018   for (auto *V : filterExtractingOperands(Ops, VF))
6019     Tys.push_back(maybeVectorizeType(V->getType(), VF));
6020   return Cost + TTI.getOperandsScalarizationOverhead(
6021                     filterExtractingOperands(Ops, VF), Tys, CostKind);
6022 }
6023 
6024 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
6025   if (VF.isScalar())
6026     return;
6027   NumPredStores = 0;
6028   for (BasicBlock *BB : TheLoop->blocks()) {
6029     // For each instruction in the old loop.
6030     for (Instruction &I : *BB) {
6031       Value *Ptr =  getLoadStorePointerOperand(&I);
6032       if (!Ptr)
6033         continue;
6034 
6035       // TODO: We should generate better code and update the cost model for
6036       // predicated uniform stores. Today they are treated as any other
6037       // predicated store (see added test cases in
6038       // invariant-store-vectorization.ll).
6039       if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF))
6040         NumPredStores++;
6041 
6042       if (Legal->isUniformMemOp(I, VF)) {
6043         auto IsLegalToScalarize = [&]() {
6044           if (!VF.isScalable())
6045             // Scalarization of fixed length vectors "just works".
6046             return true;
6047 
6048           // We have dedicated lowering for unpredicated uniform loads and
6049           // stores.  Note that even with tail folding we know that at least
6050           // one lane is active (i.e. generalized predication is not possible
6051           // here), and the logic below depends on this fact.
6052           if (!foldTailByMasking())
6053             return true;
6054 
6055           // For scalable vectors, a uniform memop load is always
6056           // uniform-by-parts  and we know how to scalarize that.
6057           if (isa<LoadInst>(I))
6058             return true;
6059 
6060           // A uniform store isn't neccessarily uniform-by-part
6061           // and we can't assume scalarization.
6062           auto &SI = cast<StoreInst>(I);
6063           return TheLoop->isLoopInvariant(SI.getValueOperand());
6064         };
6065 
6066         const InstructionCost GatherScatterCost =
6067           isLegalGatherOrScatter(&I, VF) ?
6068           getGatherScatterCost(&I, VF) : InstructionCost::getInvalid();
6069 
6070         // Load: Scalar load + broadcast
6071         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6072         // FIXME: This cost is a significant under-estimate for tail folded
6073         // memory ops.
6074         const InstructionCost ScalarizationCost =
6075             IsLegalToScalarize() ? getUniformMemOpCost(&I, VF)
6076                                  : InstructionCost::getInvalid();
6077 
6078         // Choose better solution for the current VF,  Note that Invalid
6079         // costs compare as maximumal large.  If both are invalid, we get
6080         // scalable invalid which signals a failure and a vectorization abort.
6081         if (GatherScatterCost < ScalarizationCost)
6082           setWideningDecision(&I, VF, CM_GatherScatter, GatherScatterCost);
6083         else
6084           setWideningDecision(&I, VF, CM_Scalarize, ScalarizationCost);
6085         continue;
6086       }
6087 
6088       // We assume that widening is the best solution when possible.
6089       if (memoryInstructionCanBeWidened(&I, VF)) {
6090         InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
6091         int ConsecutiveStride = Legal->isConsecutivePtr(
6092             getLoadStoreType(&I), getLoadStorePointerOperand(&I));
6093         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6094                "Expected consecutive stride.");
6095         InstWidening Decision =
6096             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6097         setWideningDecision(&I, VF, Decision, Cost);
6098         continue;
6099       }
6100 
6101       // Choose between Interleaving, Gather/Scatter or Scalarization.
6102       InstructionCost InterleaveCost = InstructionCost::getInvalid();
6103       unsigned NumAccesses = 1;
6104       if (isAccessInterleaved(&I)) {
6105         const auto *Group = getInterleavedAccessGroup(&I);
6106         assert(Group && "Fail to get an interleaved access group.");
6107 
6108         // Make one decision for the whole group.
6109         if (getWideningDecision(&I, VF) != CM_Unknown)
6110           continue;
6111 
6112         NumAccesses = Group->getNumMembers();
6113         if (interleavedAccessCanBeWidened(&I, VF))
6114           InterleaveCost = getInterleaveGroupCost(&I, VF);
6115       }
6116 
6117       InstructionCost GatherScatterCost =
6118           isLegalGatherOrScatter(&I, VF)
6119               ? getGatherScatterCost(&I, VF) * NumAccesses
6120               : InstructionCost::getInvalid();
6121 
6122       InstructionCost ScalarizationCost =
6123           getMemInstScalarizationCost(&I, VF) * NumAccesses;
6124 
6125       // Choose better solution for the current VF,
6126       // write down this decision and use it during vectorization.
6127       InstructionCost Cost;
6128       InstWidening Decision;
6129       if (InterleaveCost <= GatherScatterCost &&
6130           InterleaveCost < ScalarizationCost) {
6131         Decision = CM_Interleave;
6132         Cost = InterleaveCost;
6133       } else if (GatherScatterCost < ScalarizationCost) {
6134         Decision = CM_GatherScatter;
6135         Cost = GatherScatterCost;
6136       } else {
6137         Decision = CM_Scalarize;
6138         Cost = ScalarizationCost;
6139       }
6140       // If the instructions belongs to an interleave group, the whole group
6141       // receives the same decision. The whole group receives the cost, but
6142       // the cost will actually be assigned to one instruction.
6143       if (const auto *Group = getInterleavedAccessGroup(&I))
6144         setWideningDecision(Group, VF, Decision, Cost);
6145       else
6146         setWideningDecision(&I, VF, Decision, Cost);
6147     }
6148   }
6149 
6150   // Make sure that any load of address and any other address computation
6151   // remains scalar unless there is gather/scatter support. This avoids
6152   // inevitable extracts into address registers, and also has the benefit of
6153   // activating LSR more, since that pass can't optimize vectorized
6154   // addresses.
6155   if (TTI.prefersVectorizedAddressing())
6156     return;
6157 
6158   // Start with all scalar pointer uses.
6159   SmallPtrSet<Instruction *, 8> AddrDefs;
6160   for (BasicBlock *BB : TheLoop->blocks())
6161     for (Instruction &I : *BB) {
6162       Instruction *PtrDef =
6163         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6164       if (PtrDef && TheLoop->contains(PtrDef) &&
6165           getWideningDecision(&I, VF) != CM_GatherScatter)
6166         AddrDefs.insert(PtrDef);
6167     }
6168 
6169   // Add all instructions used to generate the addresses.
6170   SmallVector<Instruction *, 4> Worklist;
6171   append_range(Worklist, AddrDefs);
6172   while (!Worklist.empty()) {
6173     Instruction *I = Worklist.pop_back_val();
6174     for (auto &Op : I->operands())
6175       if (auto *InstOp = dyn_cast<Instruction>(Op))
6176         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6177             AddrDefs.insert(InstOp).second)
6178           Worklist.push_back(InstOp);
6179   }
6180 
6181   for (auto *I : AddrDefs) {
6182     if (isa<LoadInst>(I)) {
6183       // Setting the desired widening decision should ideally be handled in
6184       // by cost functions, but since this involves the task of finding out
6185       // if the loaded register is involved in an address computation, it is
6186       // instead changed here when we know this is the case.
6187       InstWidening Decision = getWideningDecision(I, VF);
6188       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6189         // Scalarize a widened load of address.
6190         setWideningDecision(
6191             I, VF, CM_Scalarize,
6192             (VF.getKnownMinValue() *
6193              getMemoryInstructionCost(I, ElementCount::getFixed(1))));
6194       else if (const auto *Group = getInterleavedAccessGroup(I)) {
6195         // Scalarize an interleave group of address loads.
6196         for (unsigned I = 0; I < Group->getFactor(); ++I) {
6197           if (Instruction *Member = Group->getMember(I))
6198             setWideningDecision(
6199                 Member, VF, CM_Scalarize,
6200                 (VF.getKnownMinValue() *
6201                  getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
6202         }
6203       }
6204     } else
6205       // Make sure I gets scalarized and a cost estimate without
6206       // scalarization overhead.
6207       ForcedScalars[VF].insert(I);
6208   }
6209 }
6210 
6211 void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
6212   assert(!VF.isScalar() &&
6213          "Trying to set a vectorization decision for a scalar VF");
6214 
6215   auto ForcedScalar = ForcedScalars.find(VF);
6216   for (BasicBlock *BB : TheLoop->blocks()) {
6217     // For each instruction in the old loop.
6218     for (Instruction &I : *BB) {
6219       CallInst *CI = dyn_cast<CallInst>(&I);
6220 
6221       if (!CI)
6222         continue;
6223 
6224       InstructionCost ScalarCost = InstructionCost::getInvalid();
6225       InstructionCost VectorCost = InstructionCost::getInvalid();
6226       InstructionCost IntrinsicCost = InstructionCost::getInvalid();
6227       Function *ScalarFunc = CI->getCalledFunction();
6228       Type *ScalarRetTy = CI->getType();
6229       SmallVector<Type *, 4> Tys, ScalarTys;
6230       for (auto &ArgOp : CI->args())
6231         ScalarTys.push_back(ArgOp->getType());
6232 
6233       // Estimate cost of scalarized vector call. The source operands are
6234       // assumed to be vectors, so we need to extract individual elements from
6235       // there, execute VF scalar calls, and then gather the result into the
6236       // vector return value.
6237       InstructionCost ScalarCallCost =
6238           TTI.getCallInstrCost(ScalarFunc, ScalarRetTy, ScalarTys, CostKind);
6239 
6240       // Compute costs of unpacking argument values for the scalar calls and
6241       // packing the return values to a vector.
6242       InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);
6243 
6244       ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
6245       // Honor ForcedScalars and UniformAfterVectorization decisions.
6246       // TODO: For calls, it might still be more profitable to widen. Use
6247       // VPlan-based cost model to compare different options.
6248       if (VF.isVector() && ((ForcedScalar != ForcedScalars.end() &&
6249                              ForcedScalar->second.contains(CI)) ||
6250                             isUniformAfterVectorization(CI, VF))) {
6251         setCallWideningDecision(CI, VF, CM_Scalarize, nullptr,
6252                                 Intrinsic::not_intrinsic, std::nullopt,
6253                                 ScalarCost);
6254         continue;
6255       }
6256 
6257       bool MaskRequired = Legal->isMaskRequired(CI);
6258       // Compute corresponding vector type for return value and arguments.
6259       Type *RetTy = toVectorTy(ScalarRetTy, VF);
6260       for (Type *ScalarTy : ScalarTys)
6261         Tys.push_back(toVectorTy(ScalarTy, VF));
6262 
6263       // An in-loop reduction using an fmuladd intrinsic is a special case;
6264       // we don't want the normal cost for that intrinsic.
6265       if (RecurrenceDescriptor::isFMulAddIntrinsic(CI))
6266         if (auto RedCost = getReductionPatternCost(CI, VF, RetTy)) {
6267           setCallWideningDecision(CI, VF, CM_IntrinsicCall, nullptr,
6268                                   getVectorIntrinsicIDForCall(CI, TLI),
6269                                   std::nullopt, *RedCost);
6270           continue;
6271         }
6272 
6273       // Find the cost of vectorizing the call, if we can find a suitable
6274       // vector variant of the function.
6275       bool UsesMask = false;
6276       VFInfo FuncInfo;
6277       Function *VecFunc = nullptr;
6278       // Search through any available variants for one we can use at this VF.
6279       for (VFInfo &Info : VFDatabase::getMappings(*CI)) {
6280         // Must match requested VF.
6281         if (Info.Shape.VF != VF)
6282           continue;
6283 
6284         // Must take a mask argument if one is required
6285         if (MaskRequired && !Info.isMasked())
6286           continue;
6287 
6288         // Check that all parameter kinds are supported
6289         bool ParamsOk = true;
6290         for (VFParameter Param : Info.Shape.Parameters) {
6291           switch (Param.ParamKind) {
6292           case VFParamKind::Vector:
6293             break;
6294           case VFParamKind::OMP_Uniform: {
6295             Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
6296             // Make sure the scalar parameter in the loop is invariant.
6297             if (!PSE.getSE()->isLoopInvariant(PSE.getSCEV(ScalarParam),
6298                                               TheLoop))
6299               ParamsOk = false;
6300             break;
6301           }
6302           case VFParamKind::OMP_Linear: {
6303             Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
6304             // Find the stride for the scalar parameter in this loop and see if
6305             // it matches the stride for the variant.
6306             // TODO: do we need to figure out the cost of an extract to get the
6307             // first lane? Or do we hope that it will be folded away?
6308             ScalarEvolution *SE = PSE.getSE();
6309             const auto *SAR =
6310                 dyn_cast<SCEVAddRecExpr>(SE->getSCEV(ScalarParam));
6311 
6312             if (!SAR || SAR->getLoop() != TheLoop) {
6313               ParamsOk = false;
6314               break;
6315             }
6316 
6317             const SCEVConstant *Step =
6318                 dyn_cast<SCEVConstant>(SAR->getStepRecurrence(*SE));
6319 
6320             if (!Step ||
6321                 Step->getAPInt().getSExtValue() != Param.LinearStepOrPos)
6322               ParamsOk = false;
6323 
6324             break;
6325           }
6326           case VFParamKind::GlobalPredicate:
6327             UsesMask = true;
6328             break;
6329           default:
6330             ParamsOk = false;
6331             break;
6332           }
6333         }
6334 
6335         if (!ParamsOk)
6336           continue;
6337 
6338         // Found a suitable candidate, stop here.
6339         VecFunc = CI->getModule()->getFunction(Info.VectorName);
6340         FuncInfo = Info;
6341         break;
6342       }
6343 
6344       // Add in the cost of synthesizing a mask if one wasn't required.
6345       InstructionCost MaskCost = 0;
6346       if (VecFunc && UsesMask && !MaskRequired)
6347         MaskCost = TTI.getShuffleCost(
6348             TargetTransformInfo::SK_Broadcast,
6349             VectorType::get(IntegerType::getInt1Ty(
6350                                 VecFunc->getFunctionType()->getContext()),
6351                             VF),
6352             {}, CostKind);
6353 
6354       if (TLI && VecFunc && !CI->isNoBuiltin())
6355         VectorCost =
6356             TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind) + MaskCost;
6357 
6358       // Find the cost of an intrinsic; some targets may have instructions that
6359       // perform the operation without needing an actual call.
6360       Intrinsic::ID IID = getVectorIntrinsicIDForCall(CI, TLI);
6361       if (IID != Intrinsic::not_intrinsic)
6362         IntrinsicCost = getVectorIntrinsicCost(CI, VF);
6363 
6364       InstructionCost Cost = ScalarCost;
6365       InstWidening Decision = CM_Scalarize;
6366 
6367       if (VectorCost <= Cost) {
6368         Cost = VectorCost;
6369         Decision = CM_VectorCall;
6370       }
6371 
6372       if (IntrinsicCost <= Cost) {
6373         Cost = IntrinsicCost;
6374         Decision = CM_IntrinsicCall;
6375       }
6376 
6377       setCallWideningDecision(CI, VF, Decision, VecFunc, IID,
6378                               FuncInfo.getParamIndexForOptionalMask(), Cost);
6379     }
6380   }
6381 }
6382 
6383 bool LoopVectorizationCostModel::shouldConsiderInvariant(Value *Op) {
6384   if (!Legal->isInvariant(Op))
6385     return false;
6386   // Consider Op invariant, if it or its operands aren't predicated
6387   // instruction in the loop. In that case, it is not trivially hoistable.
6388   auto *OpI = dyn_cast<Instruction>(Op);
6389   return !OpI || !TheLoop->contains(OpI) ||
6390          (!isPredicatedInst(OpI) &&
6391           (!isa<PHINode>(OpI) || OpI->getParent() != TheLoop->getHeader()) &&
6392           all_of(OpI->operands(),
6393                  [this](Value *Op) { return shouldConsiderInvariant(Op); }));
6394 }
6395 
6396 InstructionCost
6397 LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6398                                                ElementCount VF) {
6399   // If we know that this instruction will remain uniform, check the cost of
6400   // the scalar version.
6401   if (isUniformAfterVectorization(I, VF))
6402     VF = ElementCount::getFixed(1);
6403 
6404   if (VF.isVector() && isProfitableToScalarize(I, VF))
6405     return InstsToScalarize[VF][I];
6406 
6407   // Forced scalars do not have any scalarization overhead.
6408   auto ForcedScalar = ForcedScalars.find(VF);
6409   if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
6410     auto InstSet = ForcedScalar->second;
6411     if (InstSet.count(I))
6412       return getInstructionCost(I, ElementCount::getFixed(1)) *
6413              VF.getKnownMinValue();
6414   }
6415 
6416   Type *RetTy = I->getType();
6417   if (canTruncateToMinimalBitwidth(I, VF))
6418     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6419   auto *SE = PSE.getSE();
6420 
6421   auto HasSingleCopyAfterVectorization = [this](Instruction *I,
6422                                                 ElementCount VF) -> bool {
6423     if (VF.isScalar())
6424       return true;
6425 
6426     auto Scalarized = InstsToScalarize.find(VF);
6427     assert(Scalarized != InstsToScalarize.end() &&
6428            "VF not yet analyzed for scalarization profitability");
6429     return !Scalarized->second.count(I) &&
6430            llvm::all_of(I->users(), [&](User *U) {
6431              auto *UI = cast<Instruction>(U);
6432              return !Scalarized->second.count(UI);
6433            });
6434   };
6435   (void)HasSingleCopyAfterVectorization;
6436 
6437   Type *VectorTy;
6438   if (isScalarAfterVectorization(I, VF)) {
6439     // With the exception of GEPs and PHIs, after scalarization there should
6440     // only be one copy of the instruction generated in the loop. This is
6441     // because the VF is either 1, or any instructions that need scalarizing
6442     // have already been dealt with by the time we get here. As a result,
6443     // it means we don't have to multiply the instruction cost by VF.
6444     assert(I->getOpcode() == Instruction::GetElementPtr ||
6445            I->getOpcode() == Instruction::PHI ||
6446            (I->getOpcode() == Instruction::BitCast &&
6447             I->getType()->isPointerTy()) ||
6448            HasSingleCopyAfterVectorization(I, VF));
6449     VectorTy = RetTy;
6450   } else
6451     VectorTy = toVectorTy(RetTy, VF);
6452 
6453   if (VF.isVector() && VectorTy->isVectorTy() &&
6454       !TTI.getNumberOfParts(VectorTy))
6455     return InstructionCost::getInvalid();
6456 
6457   // TODO: We need to estimate the cost of intrinsic calls.
6458   switch (I->getOpcode()) {
6459   case Instruction::GetElementPtr:
6460     // We mark this instruction as zero-cost because the cost of GEPs in
6461     // vectorized code depends on whether the corresponding memory instruction
6462     // is scalarized or not. Therefore, we handle GEPs with the memory
6463     // instruction cost.
6464     return 0;
6465   case Instruction::Br: {
6466     // In cases of scalarized and predicated instructions, there will be VF
6467     // predicated blocks in the vectorized loop. Each branch around these
6468     // blocks requires also an extract of its vector compare i1 element.
6469     // Note that the conditional branch from the loop latch will be replaced by
6470     // a single branch controlling the loop, so there is no extra overhead from
6471     // scalarization.
6472     bool ScalarPredicatedBB = false;
6473     BranchInst *BI = cast<BranchInst>(I);
6474     if (VF.isVector() && BI->isConditional() &&
6475         (PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) ||
6476          PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))) &&
6477         BI->getParent() != TheLoop->getLoopLatch())
6478       ScalarPredicatedBB = true;
6479 
6480     if (ScalarPredicatedBB) {
6481       // Not possible to scalarize scalable vector with predicated instructions.
6482       if (VF.isScalable())
6483         return InstructionCost::getInvalid();
6484       // Return cost for branches around scalarized and predicated blocks.
6485       auto *VecI1Ty =
6486           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6487       return (
6488           TTI.getScalarizationOverhead(
6489               VecI1Ty, APInt::getAllOnes(VF.getFixedValue()),
6490               /*Insert*/ false, /*Extract*/ true, CostKind) +
6491           (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
6492     }
6493 
6494     if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
6495       // The back-edge branch will remain, as will all scalar branches.
6496       return TTI.getCFInstrCost(Instruction::Br, CostKind);
6497 
6498     // This branch will be eliminated by if-conversion.
6499     return 0;
6500     // Note: We currently assume zero cost for an unconditional branch inside
6501     // a predicated block since it will become a fall-through, although we
6502     // may decide in the future to call TTI for all branches.
6503   }
6504   case Instruction::Switch: {
6505     if (VF.isScalar())
6506       return TTI.getCFInstrCost(Instruction::Switch, CostKind);
6507     auto *Switch = cast<SwitchInst>(I);
6508     return Switch->getNumCases() *
6509            TTI.getCmpSelInstrCost(
6510                Instruction::ICmp,
6511                toVectorTy(Switch->getCondition()->getType(), VF),
6512                toVectorTy(Type::getInt1Ty(I->getContext()), VF),
6513                CmpInst::ICMP_EQ, CostKind);
6514   }
6515   case Instruction::PHI: {
6516     auto *Phi = cast<PHINode>(I);
6517 
6518     // First-order recurrences are replaced by vector shuffles inside the loop.
6519     if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) {
6520       // For <vscale x 1 x i64>, if vscale = 1 we are unable to extract the
6521       // penultimate value of the recurrence.
6522       // TODO: Consider vscale_range info.
6523       if (VF.isScalable() && VF.getKnownMinValue() == 1)
6524         return InstructionCost::getInvalid();
6525       SmallVector<int> Mask(VF.getKnownMinValue());
6526       std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
6527       return TTI.getShuffleCost(TargetTransformInfo::SK_Splice,
6528                                 cast<VectorType>(VectorTy), Mask, CostKind,
6529                                 VF.getKnownMinValue() - 1);
6530     }
6531 
6532     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6533     // converted into select instructions. We require N - 1 selects per phi
6534     // node, where N is the number of incoming values.
6535     if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) {
6536       Type *ResultTy = Phi->getType();
6537 
6538       // All instructions in an Any-of reduction chain are narrowed to bool.
6539       // Check if that is the case for this phi node.
6540       auto *HeaderUser = cast_if_present<PHINode>(
6541           find_singleton<User>(Phi->users(), [this](User *U, bool) -> User * {
6542             auto *Phi = dyn_cast<PHINode>(U);
6543             if (Phi && Phi->getParent() == TheLoop->getHeader())
6544               return Phi;
6545             return nullptr;
6546           }));
6547       if (HeaderUser) {
6548         auto &ReductionVars = Legal->getReductionVars();
6549         auto Iter = ReductionVars.find(HeaderUser);
6550         if (Iter != ReductionVars.end() &&
6551             RecurrenceDescriptor::isAnyOfRecurrenceKind(
6552                 Iter->second.getRecurrenceKind()))
6553           ResultTy = Type::getInt1Ty(Phi->getContext());
6554       }
6555       return (Phi->getNumIncomingValues() - 1) *
6556              TTI.getCmpSelInstrCost(
6557                  Instruction::Select, toVectorTy(ResultTy, VF),
6558                  toVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
6559                  CmpInst::BAD_ICMP_PREDICATE, CostKind);
6560     }
6561 
6562     // When tail folding with EVL, if the phi is part of an out of loop
6563     // reduction then it will be transformed into a wide vp_merge.
6564     if (VF.isVector() && foldTailWithEVL() &&
6565         Legal->getReductionVars().contains(Phi) && !isInLoopReduction(Phi)) {
6566       IntrinsicCostAttributes ICA(
6567           Intrinsic::vp_merge, toVectorTy(Phi->getType(), VF),
6568           {toVectorTy(Type::getInt1Ty(Phi->getContext()), VF)});
6569       return TTI.getIntrinsicInstrCost(ICA, CostKind);
6570     }
6571 
6572     return TTI.getCFInstrCost(Instruction::PHI, CostKind);
6573   }
6574   case Instruction::UDiv:
6575   case Instruction::SDiv:
6576   case Instruction::URem:
6577   case Instruction::SRem:
6578     if (VF.isVector() && isPredicatedInst(I)) {
6579       const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
6580       return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ?
6581         ScalarCost : SafeDivisorCost;
6582     }
6583     // We've proven all lanes safe to speculate, fall through.
6584     [[fallthrough]];
6585   case Instruction::Add:
6586   case Instruction::Sub: {
6587     auto Info = Legal->getHistogramInfo(I);
6588     if (Info && VF.isVector()) {
6589       const HistogramInfo *HGram = Info.value();
6590       // Assume that a non-constant update value (or a constant != 1) requires
6591       // a multiply, and add that into the cost.
6592       InstructionCost MulCost = TTI::TCC_Free;
6593       ConstantInt *RHS = dyn_cast<ConstantInt>(I->getOperand(1));
6594       if (!RHS || RHS->getZExtValue() != 1)
6595         MulCost =
6596             TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6597 
6598       // Find the cost of the histogram operation itself.
6599       Type *PtrTy = VectorType::get(HGram->Load->getPointerOperandType(), VF);
6600       Type *ScalarTy = I->getType();
6601       Type *MaskTy = VectorType::get(Type::getInt1Ty(I->getContext()), VF);
6602       IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add,
6603                                   Type::getVoidTy(I->getContext()),
6604                                   {PtrTy, ScalarTy, MaskTy});
6605 
6606       // Add the costs together with the add/sub operation.
6607       return TTI.getIntrinsicInstrCost(ICA, CostKind) + MulCost +
6608              TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, CostKind);
6609     }
6610     [[fallthrough]];
6611   }
6612   case Instruction::FAdd:
6613   case Instruction::FSub:
6614   case Instruction::Mul:
6615   case Instruction::FMul:
6616   case Instruction::FDiv:
6617   case Instruction::FRem:
6618   case Instruction::Shl:
6619   case Instruction::LShr:
6620   case Instruction::AShr:
6621   case Instruction::And:
6622   case Instruction::Or:
6623   case Instruction::Xor: {
6624     // If we're speculating on the stride being 1, the multiplication may
6625     // fold away.  We can generalize this for all operations using the notion
6626     // of neutral elements.  (TODO)
6627     if (I->getOpcode() == Instruction::Mul &&
6628         (PSE.getSCEV(I->getOperand(0))->isOne() ||
6629          PSE.getSCEV(I->getOperand(1))->isOne()))
6630       return 0;
6631 
6632     // Detect reduction patterns
6633     if (auto RedCost = getReductionPatternCost(I, VF, VectorTy))
6634       return *RedCost;
6635 
6636     // Certain instructions can be cheaper to vectorize if they have a constant
6637     // second vector operand. One example of this are shifts on x86.
6638     Value *Op2 = I->getOperand(1);
6639     if (!isa<Constant>(Op2) && PSE.getSE()->isSCEVable(Op2->getType()) &&
6640         isa<SCEVConstant>(PSE.getSCEV(Op2))) {
6641       Op2 = cast<SCEVConstant>(PSE.getSCEV(Op2))->getValue();
6642     }
6643     auto Op2Info = TTI.getOperandInfo(Op2);
6644     if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
6645         shouldConsiderInvariant(Op2))
6646       Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
6647 
6648     SmallVector<const Value *, 4> Operands(I->operand_values());
6649     return TTI.getArithmeticInstrCost(
6650         I->getOpcode(), VectorTy, CostKind,
6651         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6652         Op2Info, Operands, I, TLI);
6653   }
6654   case Instruction::FNeg: {
6655     return TTI.getArithmeticInstrCost(
6656         I->getOpcode(), VectorTy, CostKind,
6657         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6658         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6659         I->getOperand(0), I);
6660   }
6661   case Instruction::Select: {
6662     SelectInst *SI = cast<SelectInst>(I);
6663     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6664     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6665 
6666     const Value *Op0, *Op1;
6667     using namespace llvm::PatternMatch;
6668     if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
6669                         match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
6670       // select x, y, false --> x & y
6671       // select x, true, y --> x | y
6672       const auto [Op1VK, Op1VP] = TTI::getOperandInfo(Op0);
6673       const auto [Op2VK, Op2VP] = TTI::getOperandInfo(Op1);
6674       assert(Op0->getType()->getScalarSizeInBits() == 1 &&
6675               Op1->getType()->getScalarSizeInBits() == 1);
6676 
6677       SmallVector<const Value *, 2> Operands{Op0, Op1};
6678       return TTI.getArithmeticInstrCost(
6679           match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy,
6680           CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, I);
6681     }
6682 
6683     Type *CondTy = SI->getCondition()->getType();
6684     if (!ScalarCond)
6685       CondTy = VectorType::get(CondTy, VF);
6686 
6687     CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
6688     if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
6689       Pred = Cmp->getPredicate();
6690     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred,
6691                                   CostKind, {TTI::OK_AnyValue, TTI::OP_None},
6692                                   {TTI::OK_AnyValue, TTI::OP_None}, I);
6693   }
6694   case Instruction::ICmp:
6695   case Instruction::FCmp: {
6696     Type *ValTy = I->getOperand(0)->getType();
6697 
6698     if (canTruncateToMinimalBitwidth(I, VF)) {
6699       Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6700       (void)Op0AsInstruction;
6701       assert((!canTruncateToMinimalBitwidth(Op0AsInstruction, VF) ||
6702               MinBWs[I] == MinBWs[Op0AsInstruction]) &&
6703              "if both the operand and the compare are marked for "
6704              "truncation, they must have the same bitwidth");
6705       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[I]);
6706     }
6707 
6708     VectorTy = toVectorTy(ValTy, VF);
6709     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
6710                                   cast<CmpInst>(I)->getPredicate(), CostKind,
6711                                   {TTI::OK_AnyValue, TTI::OP_None},
6712                                   {TTI::OK_AnyValue, TTI::OP_None}, I);
6713   }
6714   case Instruction::Store:
6715   case Instruction::Load: {
6716     ElementCount Width = VF;
6717     if (Width.isVector()) {
6718       InstWidening Decision = getWideningDecision(I, Width);
6719       assert(Decision != CM_Unknown &&
6720              "CM decision should be taken at this point");
6721       if (getWideningCost(I, VF) == InstructionCost::getInvalid())
6722         return InstructionCost::getInvalid();
6723       if (Decision == CM_Scalarize)
6724         Width = ElementCount::getFixed(1);
6725     }
6726     VectorTy = toVectorTy(getLoadStoreType(I), Width);
6727     return getMemoryInstructionCost(I, VF);
6728   }
6729   case Instruction::BitCast:
6730     if (I->getType()->isPointerTy())
6731       return 0;
6732     [[fallthrough]];
6733   case Instruction::ZExt:
6734   case Instruction::SExt:
6735   case Instruction::FPToUI:
6736   case Instruction::FPToSI:
6737   case Instruction::FPExt:
6738   case Instruction::PtrToInt:
6739   case Instruction::IntToPtr:
6740   case Instruction::SIToFP:
6741   case Instruction::UIToFP:
6742   case Instruction::Trunc:
6743   case Instruction::FPTrunc: {
6744     // Computes the CastContextHint from a Load/Store instruction.
6745     auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
6746       assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
6747              "Expected a load or a store!");
6748 
6749       if (VF.isScalar() || !TheLoop->contains(I))
6750         return TTI::CastContextHint::Normal;
6751 
6752       switch (getWideningDecision(I, VF)) {
6753       case LoopVectorizationCostModel::CM_GatherScatter:
6754         return TTI::CastContextHint::GatherScatter;
6755       case LoopVectorizationCostModel::CM_Interleave:
6756         return TTI::CastContextHint::Interleave;
6757       case LoopVectorizationCostModel::CM_Scalarize:
6758       case LoopVectorizationCostModel::CM_Widen:
6759         return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
6760                                         : TTI::CastContextHint::Normal;
6761       case LoopVectorizationCostModel::CM_Widen_Reverse:
6762         return TTI::CastContextHint::Reversed;
6763       case LoopVectorizationCostModel::CM_Unknown:
6764         llvm_unreachable("Instr did not go through cost modelling?");
6765       case LoopVectorizationCostModel::CM_VectorCall:
6766       case LoopVectorizationCostModel::CM_IntrinsicCall:
6767         llvm_unreachable_internal("Instr has invalid widening decision");
6768       }
6769 
6770       llvm_unreachable("Unhandled case!");
6771     };
6772 
6773     unsigned Opcode = I->getOpcode();
6774     TTI::CastContextHint CCH = TTI::CastContextHint::None;
6775     // For Trunc, the context is the only user, which must be a StoreInst.
6776     if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
6777       if (I->hasOneUse())
6778         if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
6779           CCH = ComputeCCH(Store);
6780     }
6781     // For Z/Sext, the context is the operand, which must be a LoadInst.
6782     else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
6783              Opcode == Instruction::FPExt) {
6784       if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
6785         CCH = ComputeCCH(Load);
6786     }
6787 
6788     // We optimize the truncation of induction variables having constant
6789     // integer steps. The cost of these truncations is the same as the scalar
6790     // operation.
6791     if (isOptimizableIVTruncate(I, VF)) {
6792       auto *Trunc = cast<TruncInst>(I);
6793       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
6794                                   Trunc->getSrcTy(), CCH, CostKind, Trunc);
6795     }
6796 
6797     // Detect reduction patterns
6798     if (auto RedCost = getReductionPatternCost(I, VF, VectorTy))
6799       return *RedCost;
6800 
6801     Type *SrcScalarTy = I->getOperand(0)->getType();
6802     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6803     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6804       SrcScalarTy =
6805           IntegerType::get(SrcScalarTy->getContext(), MinBWs[Op0AsInstruction]);
6806     Type *SrcVecTy =
6807         VectorTy->isVectorTy() ? toVectorTy(SrcScalarTy, VF) : SrcScalarTy;
6808 
6809     if (canTruncateToMinimalBitwidth(I, VF)) {
6810       // If the result type is <= the source type, there will be no extend
6811       // after truncating the users to the minimal required bitwidth.
6812       if (VectorTy->getScalarSizeInBits() <= SrcVecTy->getScalarSizeInBits() &&
6813           (I->getOpcode() == Instruction::ZExt ||
6814            I->getOpcode() == Instruction::SExt))
6815         return 0;
6816     }
6817 
6818     return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
6819   }
6820   case Instruction::Call:
6821     return getVectorCallCost(cast<CallInst>(I), VF);
6822   case Instruction::ExtractValue:
6823     return TTI.getInstructionCost(I, CostKind);
6824   case Instruction::Alloca:
6825     // We cannot easily widen alloca to a scalable alloca, as
6826     // the result would need to be a vector of pointers.
6827     if (VF.isScalable())
6828       return InstructionCost::getInvalid();
6829     [[fallthrough]];
6830   default:
6831     // This opcode is unknown. Assume that it is the same as 'mul'.
6832     return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6833   } // end of switch.
6834 }
6835 
6836 void LoopVectorizationCostModel::collectValuesToIgnore() {
6837   // Ignore ephemeral values.
6838   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
6839 
6840   SmallVector<Value *, 4> DeadInterleavePointerOps;
6841   SmallVector<Value *, 4> DeadOps;
6842 
6843   // If a scalar epilogue is required, users outside the loop won't use
6844   // live-outs from the vector loop but from the scalar epilogue. Ignore them if
6845   // that is the case.
6846   bool RequiresScalarEpilogue = requiresScalarEpilogue(true);
6847   auto IsLiveOutDead = [this, RequiresScalarEpilogue](User *U) {
6848     return RequiresScalarEpilogue &&
6849            !TheLoop->contains(cast<Instruction>(U)->getParent());
6850   };
6851 
6852   LoopBlocksDFS DFS(TheLoop);
6853   DFS.perform(LI);
6854   MapVector<Value *, SmallVector<Value *>> DeadInvariantStoreOps;
6855   for (BasicBlock *BB : reverse(make_range(DFS.beginRPO(), DFS.endRPO())))
6856     for (Instruction &I : reverse(*BB)) {
6857       // Find all stores to invariant variables. Since they are going to sink
6858       // outside the loop we do not need calculate cost for them.
6859       StoreInst *SI;
6860       if ((SI = dyn_cast<StoreInst>(&I)) &&
6861           Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) {
6862         ValuesToIgnore.insert(&I);
6863         DeadInvariantStoreOps[SI->getPointerOperand()].push_back(
6864             SI->getValueOperand());
6865       }
6866 
6867       if (VecValuesToIgnore.contains(&I) || ValuesToIgnore.contains(&I))
6868         continue;
6869 
6870       // Add instructions that would be trivially dead and are only used by
6871       // values already ignored to DeadOps to seed worklist.
6872       if (wouldInstructionBeTriviallyDead(&I, TLI) &&
6873           all_of(I.users(), [this, IsLiveOutDead](User *U) {
6874             return VecValuesToIgnore.contains(U) ||
6875                    ValuesToIgnore.contains(U) || IsLiveOutDead(U);
6876           }))
6877         DeadOps.push_back(&I);
6878 
6879       // For interleave groups, we only create a pointer for the start of the
6880       // interleave group. Queue up addresses of group members except the insert
6881       // position for further processing.
6882       if (isAccessInterleaved(&I)) {
6883         auto *Group = getInterleavedAccessGroup(&I);
6884         if (Group->getInsertPos() == &I)
6885           continue;
6886         Value *PointerOp = getLoadStorePointerOperand(&I);
6887         DeadInterleavePointerOps.push_back(PointerOp);
6888       }
6889 
6890       // Queue branches for analysis. They are dead, if their successors only
6891       // contain dead instructions.
6892       if (auto *Br = dyn_cast<BranchInst>(&I)) {
6893         if (Br->isConditional())
6894           DeadOps.push_back(&I);
6895       }
6896     }
6897 
6898   // Mark ops feeding interleave group members as free, if they are only used
6899   // by other dead computations.
6900   for (unsigned I = 0; I != DeadInterleavePointerOps.size(); ++I) {
6901     auto *Op = dyn_cast<Instruction>(DeadInterleavePointerOps[I]);
6902     if (!Op || !TheLoop->contains(Op) || any_of(Op->users(), [this](User *U) {
6903           Instruction *UI = cast<Instruction>(U);
6904           return !VecValuesToIgnore.contains(U) &&
6905                  (!isAccessInterleaved(UI) ||
6906                   getInterleavedAccessGroup(UI)->getInsertPos() == UI);
6907         }))
6908       continue;
6909     VecValuesToIgnore.insert(Op);
6910     DeadInterleavePointerOps.append(Op->op_begin(), Op->op_end());
6911   }
6912 
6913   for (const auto &[_, Ops] : DeadInvariantStoreOps) {
6914     for (Value *Op : ArrayRef(Ops).drop_back())
6915       DeadOps.push_back(Op);
6916   }
6917   // Mark ops that would be trivially dead and are only used by ignored
6918   // instructions as free.
6919   BasicBlock *Header = TheLoop->getHeader();
6920 
6921   // Returns true if the block contains only dead instructions. Such blocks will
6922   // be removed by VPlan-to-VPlan transforms and won't be considered by the
6923   // VPlan-based cost model, so skip them in the legacy cost-model as well.
6924   auto IsEmptyBlock = [this](BasicBlock *BB) {
6925     return all_of(*BB, [this](Instruction &I) {
6926       return ValuesToIgnore.contains(&I) || VecValuesToIgnore.contains(&I) ||
6927              (isa<BranchInst>(&I) && !cast<BranchInst>(&I)->isConditional());
6928     });
6929   };
6930   for (unsigned I = 0; I != DeadOps.size(); ++I) {
6931     auto *Op = dyn_cast<Instruction>(DeadOps[I]);
6932 
6933     // Check if the branch should be considered dead.
6934     if (auto *Br = dyn_cast_or_null<BranchInst>(Op)) {
6935       BasicBlock *ThenBB = Br->getSuccessor(0);
6936       BasicBlock *ElseBB = Br->getSuccessor(1);
6937       // Don't considers branches leaving the loop for simplification.
6938       if (!TheLoop->contains(ThenBB) || !TheLoop->contains(ElseBB))
6939         continue;
6940       bool ThenEmpty = IsEmptyBlock(ThenBB);
6941       bool ElseEmpty = IsEmptyBlock(ElseBB);
6942       if ((ThenEmpty && ElseEmpty) ||
6943           (ThenEmpty && ThenBB->getSingleSuccessor() == ElseBB &&
6944            ElseBB->phis().empty()) ||
6945           (ElseEmpty && ElseBB->getSingleSuccessor() == ThenBB &&
6946            ThenBB->phis().empty())) {
6947         VecValuesToIgnore.insert(Br);
6948         DeadOps.push_back(Br->getCondition());
6949       }
6950       continue;
6951     }
6952 
6953     // Skip any op that shouldn't be considered dead.
6954     if (!Op || !TheLoop->contains(Op) ||
6955         (isa<PHINode>(Op) && Op->getParent() == Header) ||
6956         !wouldInstructionBeTriviallyDead(Op, TLI) ||
6957         any_of(Op->users(), [this, IsLiveOutDead](User *U) {
6958           return !VecValuesToIgnore.contains(U) &&
6959                  !ValuesToIgnore.contains(U) && !IsLiveOutDead(U);
6960         }))
6961       continue;
6962 
6963     if (!TheLoop->contains(Op->getParent()))
6964       continue;
6965 
6966     // If all of Op's users are in ValuesToIgnore, add it to ValuesToIgnore
6967     // which applies for both scalar and vector versions. Otherwise it is only
6968     // dead in vector versions, so only add it to VecValuesToIgnore.
6969     if (all_of(Op->users(),
6970                [this](User *U) { return ValuesToIgnore.contains(U); }))
6971       ValuesToIgnore.insert(Op);
6972 
6973     VecValuesToIgnore.insert(Op);
6974     DeadOps.append(Op->op_begin(), Op->op_end());
6975   }
6976 
6977   // Ignore type-promoting instructions we identified during reduction
6978   // detection.
6979   for (const auto &Reduction : Legal->getReductionVars()) {
6980     const RecurrenceDescriptor &RedDes = Reduction.second;
6981     const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
6982     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6983   }
6984   // Ignore type-casting instructions we identified during induction
6985   // detection.
6986   for (const auto &Induction : Legal->getInductionVars()) {
6987     const InductionDescriptor &IndDes = Induction.second;
6988     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6989     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6990   }
6991 }
6992 
6993 void LoopVectorizationCostModel::collectInLoopReductions() {
6994   for (const auto &Reduction : Legal->getReductionVars()) {
6995     PHINode *Phi = Reduction.first;
6996     const RecurrenceDescriptor &RdxDesc = Reduction.second;
6997 
6998     // We don't collect reductions that are type promoted (yet).
6999     if (RdxDesc.getRecurrenceType() != Phi->getType())
7000       continue;
7001 
7002     // If the target would prefer this reduction to happen "in-loop", then we
7003     // want to record it as such.
7004     unsigned Opcode = RdxDesc.getOpcode();
7005     if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
7006         !TTI.preferInLoopReduction(Opcode, Phi->getType(),
7007                                    TargetTransformInfo::ReductionFlags()))
7008       continue;
7009 
7010     // Check that we can correctly put the reductions into the loop, by
7011     // finding the chain of operations that leads from the phi to the loop
7012     // exit value.
7013     SmallVector<Instruction *, 4> ReductionOperations =
7014         RdxDesc.getReductionOpChain(Phi, TheLoop);
7015     bool InLoop = !ReductionOperations.empty();
7016 
7017     if (InLoop) {
7018       InLoopReductions.insert(Phi);
7019       // Add the elements to InLoopReductionImmediateChains for cost modelling.
7020       Instruction *LastChain = Phi;
7021       for (auto *I : ReductionOperations) {
7022         InLoopReductionImmediateChains[I] = LastChain;
7023         LastChain = I;
7024       }
7025     }
7026     LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
7027                       << " reduction for phi: " << *Phi << "\n");
7028   }
7029 }
7030 
7031 // This function will select a scalable VF if the target supports scalable
7032 // vectors and a fixed one otherwise.
7033 // TODO: we could return a pair of values that specify the max VF and
7034 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
7035 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
7036 // doesn't have a cost model that can choose which plan to execute if
7037 // more than one is generated.
7038 static ElementCount determineVPlanVF(const TargetTransformInfo &TTI,
7039                                      LoopVectorizationCostModel &CM) {
7040   unsigned WidestType;
7041   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
7042 
7043   TargetTransformInfo::RegisterKind RegKind =
7044       TTI.enableScalableVectorization()
7045           ? TargetTransformInfo::RGK_ScalableVector
7046           : TargetTransformInfo::RGK_FixedWidthVector;
7047 
7048   TypeSize RegSize = TTI.getRegisterBitWidth(RegKind);
7049   unsigned N = RegSize.getKnownMinValue() / WidestType;
7050   return ElementCount::get(N, RegSize.isScalable());
7051 }
7052 
7053 VectorizationFactor
7054 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
7055   ElementCount VF = UserVF;
7056   // Outer loop handling: They may require CFG and instruction level
7057   // transformations before even evaluating whether vectorization is profitable.
7058   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7059   // the vectorization pipeline.
7060   if (!OrigLoop->isInnermost()) {
7061     // If the user doesn't provide a vectorization factor, determine a
7062     // reasonable one.
7063     if (UserVF.isZero()) {
7064       VF = determineVPlanVF(TTI, CM);
7065       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
7066 
7067       // Make sure we have a VF > 1 for stress testing.
7068       if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
7069         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
7070                           << "overriding computed VF.\n");
7071         VF = ElementCount::getFixed(4);
7072       }
7073     } else if (UserVF.isScalable() && !TTI.supportsScalableVectors() &&
7074                !ForceTargetSupportsScalableVectors) {
7075       LLVM_DEBUG(dbgs() << "LV: Not vectorizing. Scalable VF requested, but "
7076                         << "not supported by the target.\n");
7077       reportVectorizationFailure(
7078           "Scalable vectorization requested but not supported by the target",
7079           "the scalable user-specified vectorization width for outer-loop "
7080           "vectorization cannot be used because the target does not support "
7081           "scalable vectors.",
7082           "ScalableVFUnfeasible", ORE, OrigLoop);
7083       return VectorizationFactor::Disabled();
7084     }
7085     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7086     assert(isPowerOf2_32(VF.getKnownMinValue()) &&
7087            "VF needs to be a power of two");
7088     LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
7089                       << "VF " << VF << " to build VPlans.\n");
7090     buildVPlans(VF, VF);
7091 
7092     // For VPlan build stress testing, we bail out after VPlan construction.
7093     if (VPlanBuildStressTest)
7094       return VectorizationFactor::Disabled();
7095 
7096     return {VF, 0 /*Cost*/, 0 /* ScalarCost */};
7097   }
7098 
7099   LLVM_DEBUG(
7100       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
7101                 "VPlan-native path.\n");
7102   return VectorizationFactor::Disabled();
7103 }
7104 
7105 void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
7106   assert(OrigLoop->isInnermost() && "Inner loop expected.");
7107   CM.collectValuesToIgnore();
7108   CM.collectElementTypesForWidening();
7109 
7110   FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
7111   if (!MaxFactors) // Cases that should not to be vectorized nor interleaved.
7112     return;
7113 
7114   // Invalidate interleave groups if all blocks of loop will be predicated.
7115   if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
7116       !useMaskedInterleavedAccesses(TTI)) {
7117     LLVM_DEBUG(
7118         dbgs()
7119         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
7120            "which requires masked-interleaved support.\n");
7121     if (CM.InterleaveInfo.invalidateGroups())
7122       // Invalidating interleave groups also requires invalidating all decisions
7123       // based on them, which includes widening decisions and uniform and scalar
7124       // values.
7125       CM.invalidateCostModelingDecisions();
7126   }
7127 
7128   if (CM.foldTailByMasking())
7129     Legal->prepareToFoldTailByMasking();
7130 
7131   ElementCount MaxUserVF =
7132       UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
7133   if (UserVF) {
7134     if (!ElementCount::isKnownLE(UserVF, MaxUserVF)) {
7135       reportVectorizationInfo(
7136           "UserVF ignored because it may be larger than the maximal safe VF",
7137           "InvalidUserVF", ORE, OrigLoop);
7138     } else {
7139       assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
7140              "VF needs to be a power of two");
7141       // Collect the instructions (and their associated costs) that will be more
7142       // profitable to scalarize.
7143       CM.collectInLoopReductions();
7144       if (CM.selectUserVectorizationFactor(UserVF)) {
7145         LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
7146         buildVPlansWithVPRecipes(UserVF, UserVF);
7147         LLVM_DEBUG(printPlans(dbgs()));
7148         return;
7149       }
7150       reportVectorizationInfo("UserVF ignored because of invalid costs.",
7151                               "InvalidCost", ORE, OrigLoop);
7152     }
7153   }
7154 
7155   // Collect the Vectorization Factor Candidates.
7156   SmallVector<ElementCount> VFCandidates;
7157   for (auto VF = ElementCount::getFixed(1);
7158        ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
7159     VFCandidates.push_back(VF);
7160   for (auto VF = ElementCount::getScalable(1);
7161        ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
7162     VFCandidates.push_back(VF);
7163 
7164   CM.collectInLoopReductions();
7165   for (const auto &VF : VFCandidates) {
7166     // Collect Uniform and Scalar instructions after vectorization with VF.
7167     CM.collectUniformsAndScalars(VF);
7168 
7169     // Collect the instructions (and their associated costs) that will be more
7170     // profitable to scalarize.
7171     if (VF.isVector())
7172       CM.collectInstsToScalarize(VF);
7173   }
7174 
7175   buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
7176   buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
7177 
7178   LLVM_DEBUG(printPlans(dbgs()));
7179 }
7180 
7181 InstructionCost VPCostContext::getLegacyCost(Instruction *UI,
7182                                              ElementCount VF) const {
7183   if (ForceTargetInstructionCost.getNumOccurrences())
7184     return InstructionCost(ForceTargetInstructionCost.getNumOccurrences());
7185   return CM.getInstructionCost(UI, VF);
7186 }
7187 
7188 bool VPCostContext::skipCostComputation(Instruction *UI, bool IsVector) const {
7189   return CM.ValuesToIgnore.contains(UI) ||
7190          (IsVector && CM.VecValuesToIgnore.contains(UI)) ||
7191          SkipCostComputation.contains(UI);
7192 }
7193 
7194 InstructionCost
7195 LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
7196                                           VPCostContext &CostCtx) const {
7197   InstructionCost Cost;
7198   // Cost modeling for inductions is inaccurate in the legacy cost model
7199   // compared to the recipes that are generated. To match here initially during
7200   // VPlan cost model bring up directly use the induction costs from the legacy
7201   // cost model. Note that we do this as pre-processing; the VPlan may not have
7202   // any recipes associated with the original induction increment instruction
7203   // and may replace truncates with VPWidenIntOrFpInductionRecipe. We precompute
7204   // the cost of induction phis and increments (both that are represented by
7205   // recipes and those that are not), to avoid distinguishing between them here,
7206   // and skip all recipes that represent induction phis and increments (the
7207   // former case) later on, if they exist, to avoid counting them twice.
7208   // Similarly we pre-compute the cost of any optimized truncates.
7209   // TODO: Switch to more accurate costing based on VPlan.
7210   for (const auto &[IV, IndDesc] : Legal->getInductionVars()) {
7211     Instruction *IVInc = cast<Instruction>(
7212         IV->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
7213     SmallVector<Instruction *> IVInsts = {IVInc};
7214     for (unsigned I = 0; I != IVInsts.size(); I++) {
7215       for (Value *Op : IVInsts[I]->operands()) {
7216         auto *OpI = dyn_cast<Instruction>(Op);
7217         if (Op == IV || !OpI || !OrigLoop->contains(OpI) || !Op->hasOneUse())
7218           continue;
7219         IVInsts.push_back(OpI);
7220       }
7221     }
7222     IVInsts.push_back(IV);
7223     for (User *U : IV->users()) {
7224       auto *CI = cast<Instruction>(U);
7225       if (!CostCtx.CM.isOptimizableIVTruncate(CI, VF))
7226         continue;
7227       IVInsts.push_back(CI);
7228     }
7229 
7230     // If the vector loop gets executed exactly once with the given VF, ignore
7231     // the costs of comparison and induction instructions, as they'll get
7232     // simplified away.
7233     // TODO: Remove this code after stepping away from the legacy cost model and
7234     // adding code to simplify VPlans before calculating their costs.
7235     auto TC = PSE.getSE()->getSmallConstantTripCount(OrigLoop);
7236     if (VF.isFixed() && TC == VF.getFixedValue() && !CM.foldTailByMasking())
7237       addFullyUnrolledInstructionsToIgnore(OrigLoop, Legal->getInductionVars(),
7238                                            CostCtx.SkipCostComputation);
7239 
7240     for (Instruction *IVInst : IVInsts) {
7241       if (CostCtx.skipCostComputation(IVInst, VF.isVector()))
7242         continue;
7243       InstructionCost InductionCost = CostCtx.getLegacyCost(IVInst, VF);
7244       LLVM_DEBUG({
7245         dbgs() << "Cost of " << InductionCost << " for VF " << VF
7246                << ": induction instruction " << *IVInst << "\n";
7247       });
7248       Cost += InductionCost;
7249       CostCtx.SkipCostComputation.insert(IVInst);
7250     }
7251   }
7252 
7253   /// Compute the cost of all exiting conditions of the loop using the legacy
7254   /// cost model. This is to match the legacy behavior, which adds the cost of
7255   /// all exit conditions. Note that this over-estimates the cost, as there will
7256   /// be a single condition to control the vector loop.
7257   SmallVector<BasicBlock *> Exiting;
7258   CM.TheLoop->getExitingBlocks(Exiting);
7259   SetVector<Instruction *> ExitInstrs;
7260   // Collect all exit conditions.
7261   for (BasicBlock *EB : Exiting) {
7262     auto *Term = dyn_cast<BranchInst>(EB->getTerminator());
7263     if (!Term)
7264       continue;
7265     if (auto *CondI = dyn_cast<Instruction>(Term->getOperand(0))) {
7266       ExitInstrs.insert(CondI);
7267     }
7268   }
7269   // Compute the cost of all instructions only feeding the exit conditions.
7270   for (unsigned I = 0; I != ExitInstrs.size(); ++I) {
7271     Instruction *CondI = ExitInstrs[I];
7272     if (!OrigLoop->contains(CondI) ||
7273         !CostCtx.SkipCostComputation.insert(CondI).second)
7274       continue;
7275     InstructionCost CondICost = CostCtx.getLegacyCost(CondI, VF);
7276     LLVM_DEBUG({
7277       dbgs() << "Cost of " << CondICost << " for VF " << VF
7278              << ": exit condition instruction " << *CondI << "\n";
7279     });
7280     Cost += CondICost;
7281     for (Value *Op : CondI->operands()) {
7282       auto *OpI = dyn_cast<Instruction>(Op);
7283       if (!OpI || any_of(OpI->users(), [&ExitInstrs, this](User *U) {
7284             return OrigLoop->contains(cast<Instruction>(U)->getParent()) &&
7285                    !ExitInstrs.contains(cast<Instruction>(U));
7286           }))
7287         continue;
7288       ExitInstrs.insert(OpI);
7289     }
7290   }
7291 
7292   // The legacy cost model has special logic to compute the cost of in-loop
7293   // reductions, which may be smaller than the sum of all instructions involved
7294   // in the reduction.
7295   // TODO: Switch to costing based on VPlan once the logic has been ported.
7296   for (const auto &[RedPhi, RdxDesc] : Legal->getReductionVars()) {
7297     if (ForceTargetInstructionCost.getNumOccurrences())
7298       continue;
7299 
7300     if (!CM.isInLoopReduction(RedPhi))
7301       continue;
7302 
7303     const auto &ChainOps = RdxDesc.getReductionOpChain(RedPhi, OrigLoop);
7304     SetVector<Instruction *> ChainOpsAndOperands(ChainOps.begin(),
7305                                                  ChainOps.end());
7306     auto IsZExtOrSExt = [](const unsigned Opcode) -> bool {
7307       return Opcode == Instruction::ZExt || Opcode == Instruction::SExt;
7308     };
7309     // Also include the operands of instructions in the chain, as the cost-model
7310     // may mark extends as free.
7311     //
7312     // For ARM, some of the instruction can folded into the reducion
7313     // instruction. So we need to mark all folded instructions free.
7314     // For example: We can fold reduce(mul(ext(A), ext(B))) into one
7315     // instruction.
7316     for (auto *ChainOp : ChainOps) {
7317       for (Value *Op : ChainOp->operands()) {
7318         if (auto *I = dyn_cast<Instruction>(Op)) {
7319           ChainOpsAndOperands.insert(I);
7320           if (I->getOpcode() == Instruction::Mul) {
7321             auto *Ext0 = dyn_cast<Instruction>(I->getOperand(0));
7322             auto *Ext1 = dyn_cast<Instruction>(I->getOperand(1));
7323             if (Ext0 && IsZExtOrSExt(Ext0->getOpcode()) && Ext1 &&
7324                 Ext0->getOpcode() == Ext1->getOpcode()) {
7325               ChainOpsAndOperands.insert(Ext0);
7326               ChainOpsAndOperands.insert(Ext1);
7327             }
7328           }
7329         }
7330       }
7331     }
7332 
7333     // Pre-compute the cost for I, if it has a reduction pattern cost.
7334     for (Instruction *I : ChainOpsAndOperands) {
7335       auto ReductionCost =
7336           CM.getReductionPatternCost(I, VF, toVectorTy(I->getType(), VF));
7337       if (!ReductionCost)
7338         continue;
7339 
7340       assert(!CostCtx.SkipCostComputation.contains(I) &&
7341              "reduction op visited multiple times");
7342       CostCtx.SkipCostComputation.insert(I);
7343       LLVM_DEBUG(dbgs() << "Cost of " << ReductionCost << " for VF " << VF
7344                         << ":\n in-loop reduction " << *I << "\n");
7345       Cost += *ReductionCost;
7346     }
7347   }
7348 
7349   // Pre-compute the costs for branches except for the backedge, as the number
7350   // of replicate regions in a VPlan may not directly match the number of
7351   // branches, which would lead to different decisions.
7352   // TODO: Compute cost of branches for each replicate region in the VPlan,
7353   // which is more accurate than the legacy cost model.
7354   for (BasicBlock *BB : OrigLoop->blocks()) {
7355     if (CostCtx.skipCostComputation(BB->getTerminator(), VF.isVector()))
7356       continue;
7357     CostCtx.SkipCostComputation.insert(BB->getTerminator());
7358     if (BB == OrigLoop->getLoopLatch())
7359       continue;
7360     auto BranchCost = CostCtx.getLegacyCost(BB->getTerminator(), VF);
7361     Cost += BranchCost;
7362   }
7363 
7364   // Pre-compute costs for instructions that are forced-scalar or profitable to
7365   // scalarize. Their costs will be computed separately in the legacy cost
7366   // model.
7367   for (Instruction *ForcedScalar : CM.ForcedScalars[VF]) {
7368     if (CostCtx.skipCostComputation(ForcedScalar, VF.isVector()))
7369       continue;
7370     CostCtx.SkipCostComputation.insert(ForcedScalar);
7371     InstructionCost ForcedCost = CostCtx.getLegacyCost(ForcedScalar, VF);
7372     LLVM_DEBUG({
7373       dbgs() << "Cost of " << ForcedCost << " for VF " << VF
7374              << ": forced scalar " << *ForcedScalar << "\n";
7375     });
7376     Cost += ForcedCost;
7377   }
7378   for (const auto &[Scalarized, ScalarCost] : CM.InstsToScalarize[VF]) {
7379     if (CostCtx.skipCostComputation(Scalarized, VF.isVector()))
7380       continue;
7381     CostCtx.SkipCostComputation.insert(Scalarized);
7382     LLVM_DEBUG({
7383       dbgs() << "Cost of " << ScalarCost << " for VF " << VF
7384              << ": profitable to scalarize " << *Scalarized << "\n";
7385     });
7386     Cost += ScalarCost;
7387   }
7388 
7389   return Cost;
7390 }
7391 
7392 InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
7393                                                ElementCount VF) const {
7394   VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM,
7395                         CM.CostKind);
7396   InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx);
7397 
7398   // Now compute and add the VPlan-based cost.
7399   Cost += Plan.cost(VF, CostCtx);
7400 #ifndef NDEBUG
7401   unsigned EstimatedWidth = getEstimatedRuntimeVF(OrigLoop, CM.TTI, VF);
7402   LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost
7403                     << " (Estimated cost per lane: ");
7404   if (Cost.isValid()) {
7405     double CostPerLane = double(*Cost.getValue()) / EstimatedWidth;
7406     LLVM_DEBUG(dbgs() << format("%.1f", CostPerLane));
7407   } else /* No point dividing an invalid cost - it will still be invalid */
7408     LLVM_DEBUG(dbgs() << "Invalid");
7409   LLVM_DEBUG(dbgs() << ")\n");
7410 #endif
7411   return Cost;
7412 }
7413 
7414 #ifndef NDEBUG
7415 /// Return true if the original loop \ TheLoop contains any instructions that do
7416 /// not have corresponding recipes in \p Plan and are not marked to be ignored
7417 /// in \p CostCtx. This means the VPlan contains simplification that the legacy
7418 /// cost-model did not account for.
7419 static bool planContainsAdditionalSimplifications(VPlan &Plan,
7420                                                   VPCostContext &CostCtx,
7421                                                   Loop *TheLoop) {
7422   // First collect all instructions for the recipes in Plan.
7423   auto GetInstructionForCost = [](const VPRecipeBase *R) -> Instruction * {
7424     if (auto *S = dyn_cast<VPSingleDefRecipe>(R))
7425       return dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
7426     if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(R))
7427       return &WidenMem->getIngredient();
7428     return nullptr;
7429   };
7430 
7431   DenseSet<Instruction *> SeenInstrs;
7432   auto Iter = vp_depth_first_deep(Plan.getVectorLoopRegion()->getEntry());
7433   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
7434     for (VPRecipeBase &R : *VPBB) {
7435       if (auto *IR = dyn_cast<VPInterleaveRecipe>(&R)) {
7436         auto *IG = IR->getInterleaveGroup();
7437         unsigned NumMembers = IG->getNumMembers();
7438         for (unsigned I = 0; I != NumMembers; ++I) {
7439           if (Instruction *M = IG->getMember(I))
7440             SeenInstrs.insert(M);
7441         }
7442         continue;
7443       }
7444       // The VPlan-based cost model is more accurate for partial reduction and
7445       // comparing against the legacy cost isn't desirable.
7446       if (isa<VPPartialReductionRecipe>(&R))
7447         return true;
7448       if (Instruction *UI = GetInstructionForCost(&R))
7449         SeenInstrs.insert(UI);
7450     }
7451   }
7452 
7453   // Return true if the loop contains any instructions that are not also part of
7454   // the VPlan or are skipped for VPlan-based cost computations. This indicates
7455   // that the VPlan contains extra simplifications.
7456   return any_of(TheLoop->blocks(), [&SeenInstrs, &CostCtx,
7457                                     TheLoop](BasicBlock *BB) {
7458     return any_of(*BB, [&SeenInstrs, &CostCtx, TheLoop, BB](Instruction &I) {
7459       if (isa<PHINode>(&I) && BB == TheLoop->getHeader())
7460         return false;
7461       return !SeenInstrs.contains(&I) && !CostCtx.skipCostComputation(&I, true);
7462     });
7463   });
7464 }
7465 #endif
7466 
7467 VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
7468   if (VPlans.empty())
7469     return VectorizationFactor::Disabled();
7470   // If there is a single VPlan with a single VF, return it directly.
7471   VPlan &FirstPlan = *VPlans[0];
7472   if (VPlans.size() == 1 && size(FirstPlan.vectorFactors()) == 1)
7473     return {*FirstPlan.vectorFactors().begin(), 0, 0};
7474 
7475   LLVM_DEBUG(dbgs() << "LV: Computing best VF using cost kind: "
7476                     << (CM.CostKind == TTI::TCK_RecipThroughput
7477                             ? "Reciprocal Throughput\n"
7478                         : CM.CostKind == TTI::TCK_Latency
7479                             ? "Instruction Latency\n"
7480                         : CM.CostKind == TTI::TCK_CodeSize ? "Code Size\n"
7481                         : CM.CostKind == TTI::TCK_SizeAndLatency
7482                             ? "Code Size and Latency\n"
7483                             : "Unknown\n"));
7484 
7485   ElementCount ScalarVF = ElementCount::getFixed(1);
7486   assert(hasPlanWithVF(ScalarVF) &&
7487          "More than a single plan/VF w/o any plan having scalar VF");
7488 
7489   // TODO: Compute scalar cost using VPlan-based cost model.
7490   InstructionCost ScalarCost = CM.expectedCost(ScalarVF);
7491   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ScalarCost << ".\n");
7492   VectorizationFactor ScalarFactor(ScalarVF, ScalarCost, ScalarCost);
7493   VectorizationFactor BestFactor = ScalarFactor;
7494 
7495   bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
7496   if (ForceVectorization) {
7497     // Ignore scalar width, because the user explicitly wants vectorization.
7498     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
7499     // evaluation.
7500     BestFactor.Cost = InstructionCost::getMax();
7501   }
7502 
7503   for (auto &P : VPlans) {
7504     for (ElementCount VF : P->vectorFactors()) {
7505       if (VF.isScalar())
7506         continue;
7507       if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
7508         LLVM_DEBUG(
7509             dbgs()
7510             << "LV: Not considering vector loop of width " << VF
7511             << " because it will not generate any vector instructions.\n");
7512         continue;
7513       }
7514 
7515       InstructionCost Cost = cost(*P, VF);
7516       VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);
7517       if (isMoreProfitable(CurrentFactor, BestFactor))
7518         BestFactor = CurrentFactor;
7519 
7520       // If profitable add it to ProfitableVF list.
7521       if (isMoreProfitable(CurrentFactor, ScalarFactor))
7522         ProfitableVFs.push_back(CurrentFactor);
7523     }
7524   }
7525 
7526 #ifndef NDEBUG
7527   // Select the optimal vectorization factor according to the legacy cost-model.
7528   // This is now only used to verify the decisions by the new VPlan-based
7529   // cost-model and will be retired once the VPlan-based cost-model is
7530   // stabilized.
7531   VectorizationFactor LegacyVF = selectVectorizationFactor();
7532   VPlan &BestPlan = getPlanFor(BestFactor.Width);
7533 
7534   // Pre-compute the cost and use it to check if BestPlan contains any
7535   // simplifications not accounted for in the legacy cost model. If that's the
7536   // case, don't trigger the assertion, as the extra simplifications may cause a
7537   // different VF to be picked by the VPlan-based cost model.
7538   VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM,
7539                         CM.CostKind);
7540   precomputeCosts(BestPlan, BestFactor.Width, CostCtx);
7541   assert((BestFactor.Width == LegacyVF.Width ||
7542           planContainsAdditionalSimplifications(getPlanFor(BestFactor.Width),
7543                                                 CostCtx, OrigLoop) ||
7544           planContainsAdditionalSimplifications(getPlanFor(LegacyVF.Width),
7545                                                 CostCtx, OrigLoop)) &&
7546          " VPlan cost model and legacy cost model disagreed");
7547   assert((BestFactor.Width.isScalar() || BestFactor.ScalarCost > 0) &&
7548          "when vectorizing, the scalar cost must be computed.");
7549 #endif
7550 
7551   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << BestFactor.Width << ".\n");
7552   return BestFactor;
7553 }
7554 
7555 static void addRuntimeUnrollDisableMetaData(Loop *L) {
7556   SmallVector<Metadata *, 4> MDs;
7557   // Reserve first location for self reference to the LoopID metadata node.
7558   MDs.push_back(nullptr);
7559   bool IsUnrollMetadata = false;
7560   MDNode *LoopID = L->getLoopID();
7561   if (LoopID) {
7562     // First find existing loop unrolling disable metadata.
7563     for (unsigned I = 1, IE = LoopID->getNumOperands(); I < IE; ++I) {
7564       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(I));
7565       if (MD) {
7566         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
7567         IsUnrollMetadata =
7568             S && S->getString().starts_with("llvm.loop.unroll.disable");
7569       }
7570       MDs.push_back(LoopID->getOperand(I));
7571     }
7572   }
7573 
7574   if (!IsUnrollMetadata) {
7575     // Add runtime unroll disable metadata.
7576     LLVMContext &Context = L->getHeader()->getContext();
7577     SmallVector<Metadata *, 1> DisableOperands;
7578     DisableOperands.push_back(
7579         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7580     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7581     MDs.push_back(DisableNode);
7582     MDNode *NewLoopID = MDNode::get(Context, MDs);
7583     // Set operand 0 to refer to the loop id itself.
7584     NewLoopID->replaceOperandWith(0, NewLoopID);
7585     L->setLoopID(NewLoopID);
7586   }
7587 }
7588 
7589 // If \p R is a ComputeReductionResult when vectorizing the epilog loop,
7590 // fix the reduction's scalar PHI node by adding the incoming value from the
7591 // main vector loop.
7592 static void fixReductionScalarResumeWhenVectorizingEpilog(
7593     VPRecipeBase *R, VPTransformState &State, BasicBlock *LoopMiddleBlock,
7594     BasicBlock *BypassBlock) {
7595   auto *EpiRedResult = dyn_cast<VPInstruction>(R);
7596   if (!EpiRedResult ||
7597       EpiRedResult->getOpcode() != VPInstruction::ComputeReductionResult)
7598     return;
7599 
7600   auto *EpiRedHeaderPhi =
7601       cast<VPReductionPHIRecipe>(EpiRedResult->getOperand(0));
7602   const RecurrenceDescriptor &RdxDesc =
7603       EpiRedHeaderPhi->getRecurrenceDescriptor();
7604   Value *MainResumeValue =
7605       EpiRedHeaderPhi->getStartValue()->getUnderlyingValue();
7606   if (RecurrenceDescriptor::isAnyOfRecurrenceKind(
7607           RdxDesc.getRecurrenceKind())) {
7608     auto *Cmp = cast<ICmpInst>(MainResumeValue);
7609     assert(Cmp->getPredicate() == CmpInst::ICMP_NE &&
7610            "AnyOf expected to start with ICMP_NE");
7611     assert(Cmp->getOperand(1) == RdxDesc.getRecurrenceStartValue() &&
7612            "AnyOf expected to start by comparing main resume value to original "
7613            "start value");
7614     MainResumeValue = Cmp->getOperand(0);
7615   } else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(
7616                  RdxDesc.getRecurrenceKind())) {
7617     using namespace llvm::PatternMatch;
7618     Value *Cmp, *OrigResumeV;
7619     bool IsExpectedPattern =
7620         match(MainResumeValue, m_Select(m_OneUse(m_Value(Cmp)),
7621                                         m_Specific(RdxDesc.getSentinelValue()),
7622                                         m_Value(OrigResumeV))) &&
7623         match(Cmp,
7624               m_SpecificICmp(ICmpInst::ICMP_EQ, m_Specific(OrigResumeV),
7625                              m_Specific(RdxDesc.getRecurrenceStartValue())));
7626     assert(IsExpectedPattern && "Unexpected reduction resume pattern");
7627     (void)IsExpectedPattern;
7628     MainResumeValue = OrigResumeV;
7629   }
7630   PHINode *MainResumePhi = cast<PHINode>(MainResumeValue);
7631 
7632   // When fixing reductions in the epilogue loop we should already have
7633   // created a bc.merge.rdx Phi after the main vector body. Ensure that we carry
7634   // over the incoming values correctly.
7635   using namespace VPlanPatternMatch;
7636   auto IsResumePhi = [](VPUser *U) {
7637     return match(
7638         U, m_VPInstruction<VPInstruction::ResumePhi>(m_VPValue(), m_VPValue()));
7639   };
7640   assert(count_if(EpiRedResult->users(), IsResumePhi) == 1 &&
7641          "ResumePhi must have a single user");
7642   auto *EpiResumePhiVPI =
7643       cast<VPInstruction>(*find_if(EpiRedResult->users(), IsResumePhi));
7644   auto *EpiResumePhi = cast<PHINode>(State.get(EpiResumePhiVPI, true));
7645   EpiResumePhi->setIncomingValueForBlock(
7646       BypassBlock, MainResumePhi->getIncomingValueForBlock(BypassBlock));
7647 }
7648 
7649 DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
7650     ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan,
7651     InnerLoopVectorizer &ILV, DominatorTree *DT, bool VectorizingEpilogue,
7652     const DenseMap<const SCEV *, Value *> *ExpandedSCEVs) {
7653   assert(BestVPlan.hasVF(BestVF) &&
7654          "Trying to execute plan with unsupported VF");
7655   assert(BestVPlan.hasUF(BestUF) &&
7656          "Trying to execute plan with unsupported UF");
7657   assert(
7658       ((VectorizingEpilogue && ExpandedSCEVs) ||
7659        (!VectorizingEpilogue && !ExpandedSCEVs)) &&
7660       "expanded SCEVs to reuse can only be used during epilogue vectorization");
7661 
7662   // TODO: Move to VPlan transform stage once the transition to the VPlan-based
7663   // cost model is complete for better cost estimates.
7664   VPlanTransforms::runPass(VPlanTransforms::unrollByUF, BestVPlan, BestUF,
7665                            OrigLoop->getHeader()->getContext());
7666   VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
7667   VPlanTransforms::convertToConcreteRecipes(BestVPlan);
7668 
7669   // Perform the actual loop transformation.
7670   VPTransformState State(&TTI, BestVF, BestUF, LI, DT, ILV.Builder, &ILV,
7671                          &BestVPlan, OrigLoop->getParentLoop(),
7672                          Legal->getWidestInductionType());
7673 
7674 #ifdef EXPENSIVE_CHECKS
7675   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
7676 #endif
7677 
7678   // 0. Generate SCEV-dependent code in the entry, including TripCount, before
7679   // making any changes to the CFG.
7680   if (!BestVPlan.getEntry()->empty())
7681     BestVPlan.getEntry()->execute(&State);
7682 
7683   if (!ILV.getTripCount())
7684     ILV.setTripCount(State.get(BestVPlan.getTripCount(), VPLane(0)));
7685   else
7686     assert(VectorizingEpilogue && "should only re-use the existing trip "
7687                                   "count during epilogue vectorization");
7688 
7689   // 1. Set up the skeleton for vectorization, including vector pre-header and
7690   // middle block. The vector loop is created during VPlan execution.
7691   VPBasicBlock *VectorPH =
7692       cast<VPBasicBlock>(BestVPlan.getEntry()->getSingleSuccessor());
7693   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(
7694       ExpandedSCEVs ? *ExpandedSCEVs : State.ExpandedSCEVs);
7695   if (VectorizingEpilogue)
7696     VPlanTransforms::removeDeadRecipes(BestVPlan);
7697 
7698   // Only use noalias metadata when using memory checks guaranteeing no overlap
7699   // across all iterations.
7700   const LoopAccessInfo *LAI = ILV.Legal->getLAI();
7701   std::unique_ptr<LoopVersioning> LVer = nullptr;
7702   if (LAI && !LAI->getRuntimePointerChecking()->getChecks().empty() &&
7703       !LAI->getRuntimePointerChecking()->getDiffChecks()) {
7704 
7705     //  We currently don't use LoopVersioning for the actual loop cloning but we
7706     //  still use it to add the noalias metadata.
7707     //  TODO: Find a better way to re-use LoopVersioning functionality to add
7708     //        metadata.
7709     LVer = std::make_unique<LoopVersioning>(
7710         *LAI, LAI->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, DT,
7711         PSE.getSE());
7712     State.LVer = &*LVer;
7713     State.LVer->prepareNoAliasMetadata();
7714   }
7715 
7716   ILV.printDebugTracesAtStart();
7717 
7718   //===------------------------------------------------===//
7719   //
7720   // Notice: any optimization or new instruction that go
7721   // into the code below should also be implemented in
7722   // the cost-model.
7723   //
7724   //===------------------------------------------------===//
7725 
7726   // 2. Copy and widen instructions from the old loop into the new loop.
7727   BestVPlan.prepareToExecute(
7728       ILV.getTripCount(),
7729       ILV.getOrCreateVectorTripCount(ILV.LoopVectorPreHeader), State);
7730   replaceVPBBWithIRVPBB(VectorPH, State.CFG.PrevBB);
7731 
7732   BestVPlan.execute(&State);
7733 
7734   auto *MiddleVPBB = BestVPlan.getMiddleBlock();
7735   // 2.5 When vectorizing the epilogue, fix reduction and induction resume
7736   // values from the additional bypass block.
7737   if (VectorizingEpilogue) {
7738     assert(!ILV.Legal->hasUncountableEarlyExit() &&
7739            "Epilogue vectorisation not yet supported with early exits");
7740     BasicBlock *BypassBlock = ILV.getAdditionalBypassBlock();
7741     for (VPRecipeBase &R : *MiddleVPBB) {
7742       fixReductionScalarResumeWhenVectorizingEpilog(
7743           &R, State, State.CFG.VPBB2IRBB[MiddleVPBB], BypassBlock);
7744     }
7745     BasicBlock *PH = OrigLoop->getLoopPreheader();
7746     for (const auto &[IVPhi, _] : Legal->getInductionVars()) {
7747       auto *Inc = cast<PHINode>(IVPhi->getIncomingValueForBlock(PH));
7748       Value *V = ILV.getInductionAdditionalBypassValue(IVPhi);
7749       Inc->setIncomingValueForBlock(BypassBlock, V);
7750     }
7751   }
7752 
7753   // 2.6. Maintain Loop Hints
7754   // Keep all loop hints from the original loop on the vector loop (we'll
7755   // replace the vectorizer-specific hints below).
7756   if (auto *LoopRegion = BestVPlan.getVectorLoopRegion()) {
7757     MDNode *OrigLoopID = OrigLoop->getLoopID();
7758 
7759     std::optional<MDNode *> VectorizedLoopID =
7760         makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7761                                         LLVMLoopVectorizeFollowupVectorized});
7762 
7763     VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
7764     Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]);
7765     if (VectorizedLoopID) {
7766       L->setLoopID(*VectorizedLoopID);
7767     } else {
7768       // Keep all loop hints from the original loop on the vector loop (we'll
7769       // replace the vectorizer-specific hints below).
7770       if (MDNode *LID = OrigLoop->getLoopID())
7771         L->setLoopID(LID);
7772 
7773       LoopVectorizeHints Hints(L, true, *ORE);
7774       Hints.setAlreadyVectorized();
7775     }
7776     TargetTransformInfo::UnrollingPreferences UP;
7777     TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE);
7778     if (!UP.UnrollVectorizedLoop || VectorizingEpilogue)
7779       addRuntimeUnrollDisableMetaData(L);
7780   }
7781 
7782   // 3. Fix the vectorized code: take care of header phi's, live-outs,
7783   //    predication, updating analyses.
7784   ILV.fixVectorizedLoop(State);
7785 
7786   ILV.printDebugTracesAtEnd();
7787 
7788   // 4. Adjust branch weight of the branch in the middle block.
7789   if (BestVPlan.getVectorLoopRegion()) {
7790     auto *MiddleVPBB = BestVPlan.getMiddleBlock();
7791     auto *MiddleTerm =
7792         cast<BranchInst>(State.CFG.VPBB2IRBB[MiddleVPBB]->getTerminator());
7793     if (MiddleTerm->isConditional() &&
7794         hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
7795       // Assume that `Count % VectorTripCount` is equally distributed.
7796       unsigned TripCount = BestVPlan.getUF() * State.VF.getKnownMinValue();
7797       assert(TripCount > 0 && "trip count should not be zero");
7798       const uint32_t Weights[] = {1, TripCount - 1};
7799       setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false);
7800     }
7801   }
7802 
7803   return State.ExpandedSCEVs;
7804 }
7805 
7806 //===--------------------------------------------------------------------===//
7807 // EpilogueVectorizerMainLoop
7808 //===--------------------------------------------------------------------===//
7809 
7810 /// This function is partially responsible for generating the control flow
7811 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7812 BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton(
7813     const SCEV2ValueTy &ExpandedSCEVs) {
7814   createVectorLoopSkeleton("");
7815 
7816   // Generate the code to check the minimum iteration count of the vector
7817   // epilogue (see below).
7818   EPI.EpilogueIterationCountCheck =
7819       emitIterationCountCheck(LoopScalarPreHeader, true);
7820   EPI.EpilogueIterationCountCheck->setName("iter.check");
7821 
7822   // Generate the code to check any assumptions that we've made for SCEV
7823   // expressions.
7824   EPI.SCEVSafetyCheck = emitSCEVChecks(LoopScalarPreHeader);
7825 
7826   // Generate the code that checks at runtime if arrays overlap. We put the
7827   // checks into a separate block to make the more common case of few elements
7828   // faster.
7829   EPI.MemSafetyCheck = emitMemRuntimeChecks(LoopScalarPreHeader);
7830 
7831   // Generate the iteration count check for the main loop, *after* the check
7832   // for the epilogue loop, so that the path-length is shorter for the case
7833   // that goes directly through the vector epilogue. The longer-path length for
7834   // the main loop is compensated for, by the gain from vectorizing the larger
7835   // trip count. Note: the branch will get updated later on when we vectorize
7836   // the epilogue.
7837   EPI.MainLoopIterationCountCheck =
7838       emitIterationCountCheck(LoopScalarPreHeader, false);
7839 
7840   // Generate the induction variable.
7841   EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
7842 
7843   return LoopVectorPreHeader;
7844 }
7845 
7846 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
7847   LLVM_DEBUG({
7848     dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7849            << "Main Loop VF:" << EPI.MainLoopVF
7850            << ", Main Loop UF:" << EPI.MainLoopUF
7851            << ", Epilogue Loop VF:" << EPI.EpilogueVF
7852            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7853   });
7854 }
7855 
7856 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
7857   DEBUG_WITH_TYPE(VerboseDebug, {
7858     dbgs() << "intermediate fn:\n"
7859            << *OrigLoop->getHeader()->getParent() << "\n";
7860   });
7861 }
7862 
7863 BasicBlock *
7864 EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
7865                                                     bool ForEpilogue) {
7866   assert(Bypass && "Expected valid bypass basic block.");
7867   ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF;
7868   unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
7869   Value *Count = getTripCount();
7870   // Reuse existing vector loop preheader for TC checks.
7871   // Note that new preheader block is generated for vector loop.
7872   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
7873   IRBuilder<> Builder(TCCheckBlock->getTerminator());
7874 
7875   // Generate code to check if the loop's trip count is less than VF * UF of the
7876   // main vector loop.
7877   auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF.isVector()
7878                                                     : VF.isVector())
7879                ? ICmpInst::ICMP_ULE
7880                : ICmpInst::ICMP_ULT;
7881 
7882   Value *CheckMinIters = Builder.CreateICmp(
7883       P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor),
7884       "min.iters.check");
7885 
7886   if (!ForEpilogue)
7887     TCCheckBlock->setName("vector.main.loop.iter.check");
7888 
7889   // Create new preheader for vector loop.
7890   LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
7891                                    DT, LI, nullptr, "vector.ph");
7892 
7893   if (ForEpilogue) {
7894     assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
7895                                  DT->getNode(Bypass)->getIDom()) &&
7896            "TC check is expected to dominate Bypass");
7897 
7898     LoopBypassBlocks.push_back(TCCheckBlock);
7899 
7900     // Save the trip count so we don't have to regenerate it in the
7901     // vec.epilog.iter.check. This is safe to do because the trip count
7902     // generated here dominates the vector epilog iter check.
7903     EPI.TripCount = Count;
7904   }
7905 
7906   BranchInst &BI =
7907       *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
7908   if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
7909     setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
7910   ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
7911 
7912   introduceCheckBlockInVPlan(TCCheckBlock);
7913   return TCCheckBlock;
7914 }
7915 
7916 //===--------------------------------------------------------------------===//
7917 // EpilogueVectorizerEpilogueLoop
7918 //===--------------------------------------------------------------------===//
7919 
7920 /// This function is partially responsible for generating the control flow
7921 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7922 BasicBlock *
7923 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
7924     const SCEV2ValueTy &ExpandedSCEVs) {
7925   createVectorLoopSkeleton("vec.epilog.");
7926 
7927   // Now, compare the remaining count and if there aren't enough iterations to
7928   // execute the vectorized epilogue skip to the scalar part.
7929   LoopVectorPreHeader->setName("vec.epilog.ph");
7930   BasicBlock *VecEpilogueIterationCountCheck =
7931       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->begin(), DT, LI,
7932                  nullptr, "vec.epilog.iter.check", true);
7933   emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader,
7934                                           VecEpilogueIterationCountCheck);
7935   AdditionalBypassBlock = VecEpilogueIterationCountCheck;
7936 
7937   // Adjust the control flow taking the state info from the main loop
7938   // vectorization into account.
7939   assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
7940          "expected this to be saved from the previous pass.");
7941   EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
7942       VecEpilogueIterationCountCheck, LoopVectorPreHeader);
7943 
7944   EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
7945       VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7946 
7947   if (EPI.SCEVSafetyCheck)
7948     EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
7949         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7950   if (EPI.MemSafetyCheck)
7951     EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
7952         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7953 
7954   DT->changeImmediateDominator(LoopScalarPreHeader,
7955                                EPI.EpilogueIterationCountCheck);
7956   // Keep track of bypass blocks, as they feed start values to the induction and
7957   // reduction phis in the scalar loop preheader.
7958   if (EPI.SCEVSafetyCheck)
7959     LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
7960   if (EPI.MemSafetyCheck)
7961     LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
7962   LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
7963 
7964   // The vec.epilog.iter.check block may contain Phi nodes from inductions or
7965   // reductions which merge control-flow from the latch block and the middle
7966   // block. Update the incoming values here and move the Phi into the preheader.
7967   SmallVector<PHINode *, 4> PhisInBlock;
7968   for (PHINode &Phi : VecEpilogueIterationCountCheck->phis())
7969     PhisInBlock.push_back(&Phi);
7970 
7971   for (PHINode *Phi : PhisInBlock) {
7972     Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHIIt());
7973     Phi->replaceIncomingBlockWith(
7974         VecEpilogueIterationCountCheck->getSinglePredecessor(),
7975         VecEpilogueIterationCountCheck);
7976 
7977     // If the phi doesn't have an incoming value from the
7978     // EpilogueIterationCountCheck, we are done. Otherwise remove the incoming
7979     // value and also those from other check blocks. This is needed for
7980     // reduction phis only.
7981     if (none_of(Phi->blocks(), [&](BasicBlock *IncB) {
7982           return EPI.EpilogueIterationCountCheck == IncB;
7983         }))
7984       continue;
7985     Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
7986     if (EPI.SCEVSafetyCheck)
7987       Phi->removeIncomingValue(EPI.SCEVSafetyCheck);
7988     if (EPI.MemSafetyCheck)
7989       Phi->removeIncomingValue(EPI.MemSafetyCheck);
7990   }
7991 
7992   // Generate bypass values from the additional bypass block. Note that when the
7993   // vectorized epilogue is skipped due to iteration count check, then the
7994   // resume value for the induction variable comes from the trip count of the
7995   // main vector loop, passed as the second argument.
7996   createInductionAdditionalBypassValues(ExpandedSCEVs, EPI.VectorTripCount);
7997   return LoopVectorPreHeader;
7998 }
7999 
8000 BasicBlock *
8001 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
8002     BasicBlock *Bypass, BasicBlock *Insert) {
8003 
8004   assert(EPI.TripCount &&
8005          "Expected trip count to have been saved in the first pass.");
8006   assert(
8007       (!isa<Instruction>(EPI.TripCount) ||
8008        DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
8009       "saved trip count does not dominate insertion point.");
8010   Value *TC = EPI.TripCount;
8011   IRBuilder<> Builder(Insert->getTerminator());
8012   Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
8013 
8014   // Generate code to check if the loop's trip count is less than VF * UF of the
8015   // vector epilogue loop.
8016   auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector())
8017                ? ICmpInst::ICMP_ULE
8018                : ICmpInst::ICMP_ULT;
8019 
8020   Value *CheckMinIters =
8021       Builder.CreateICmp(P, Count,
8022                          createStepForVF(Builder, Count->getType(),
8023                                          EPI.EpilogueVF, EPI.EpilogueUF),
8024                          "min.epilog.iters.check");
8025 
8026   BranchInst &BI =
8027       *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
8028   if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
8029     unsigned MainLoopStep = UF * VF.getKnownMinValue();
8030     unsigned EpilogueLoopStep =
8031         EPI.EpilogueUF * EPI.EpilogueVF.getKnownMinValue();
8032     // We assume the remaining `Count` is equally distributed in
8033     // [0, MainLoopStep)
8034     // So the probability for `Count < EpilogueLoopStep` should be
8035     // min(MainLoopStep, EpilogueLoopStep) / MainLoopStep
8036     unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep);
8037     const uint32_t Weights[] = {EstimatedSkipCount,
8038                                 MainLoopStep - EstimatedSkipCount};
8039     setBranchWeights(BI, Weights, /*IsExpected=*/false);
8040   }
8041   ReplaceInstWithInst(Insert->getTerminator(), &BI);
8042   LoopBypassBlocks.push_back(Insert);
8043 
8044   // A new entry block has been created for the epilogue VPlan. Hook it in, as
8045   // otherwise we would try to modify the entry to the main vector loop.
8046   VPIRBasicBlock *NewEntry = Plan.createVPIRBasicBlock(Insert);
8047   VPBasicBlock *OldEntry = Plan.getEntry();
8048   VPBlockUtils::reassociateBlocks(OldEntry, NewEntry);
8049   Plan.setEntry(NewEntry);
8050   // OldEntry is now dead and will be cleaned up when the plan gets destroyed.
8051 
8052   introduceCheckBlockInVPlan(Insert);
8053   return Insert;
8054 }
8055 
8056 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
8057   LLVM_DEBUG({
8058     dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
8059            << "Epilogue Loop VF:" << EPI.EpilogueVF
8060            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
8061   });
8062 }
8063 
8064 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
8065   DEBUG_WITH_TYPE(VerboseDebug, {
8066     dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
8067   });
8068 }
8069 
8070 iterator_range<mapped_iterator<Use *, std::function<VPValue *(Value *)>>>
8071 VPRecipeBuilder::mapToVPValues(User::op_range Operands) {
8072   std::function<VPValue *(Value *)> Fn = [this](Value *Op) {
8073     return getVPValueOrAddLiveIn(Op);
8074   };
8075   return map_range(Operands, Fn);
8076 }
8077 
8078 void VPRecipeBuilder::createSwitchEdgeMasks(SwitchInst *SI) {
8079   BasicBlock *Src = SI->getParent();
8080   assert(!OrigLoop->isLoopExiting(Src) &&
8081          all_of(successors(Src),
8082                 [this](BasicBlock *Succ) {
8083                   return OrigLoop->getHeader() != Succ;
8084                 }) &&
8085          "unsupported switch either exiting loop or continuing to header");
8086   // Create masks where the terminator in Src is a switch. We create mask for
8087   // all edges at the same time. This is more efficient, as we can create and
8088   // collect compares for all cases once.
8089   VPValue *Cond = getVPValueOrAddLiveIn(SI->getCondition());
8090   BasicBlock *DefaultDst = SI->getDefaultDest();
8091   MapVector<BasicBlock *, SmallVector<VPValue *>> Dst2Compares;
8092   for (auto &C : SI->cases()) {
8093     BasicBlock *Dst = C.getCaseSuccessor();
8094     assert(!EdgeMaskCache.contains({Src, Dst}) && "Edge masks already created");
8095     // Cases whose destination is the same as default are redundant and can be
8096     // ignored - they will get there anyhow.
8097     if (Dst == DefaultDst)
8098       continue;
8099     auto &Compares = Dst2Compares[Dst];
8100     VPValue *V = getVPValueOrAddLiveIn(C.getCaseValue());
8101     Compares.push_back(Builder.createICmp(CmpInst::ICMP_EQ, Cond, V));
8102   }
8103 
8104   // We need to handle 2 separate cases below for all entries in Dst2Compares,
8105   // which excludes destinations matching the default destination.
8106   VPValue *SrcMask = getBlockInMask(Src);
8107   VPValue *DefaultMask = nullptr;
8108   for (const auto &[Dst, Conds] : Dst2Compares) {
8109     // 1. Dst is not the default destination. Dst is reached if any of the cases
8110     // with destination == Dst are taken. Join the conditions for each case
8111     // whose destination == Dst using an OR.
8112     VPValue *Mask = Conds[0];
8113     for (VPValue *V : ArrayRef<VPValue *>(Conds).drop_front())
8114       Mask = Builder.createOr(Mask, V);
8115     if (SrcMask)
8116       Mask = Builder.createLogicalAnd(SrcMask, Mask);
8117     EdgeMaskCache[{Src, Dst}] = Mask;
8118 
8119     // 2. Create the mask for the default destination, which is reached if none
8120     // of the cases with destination != default destination are taken. Join the
8121     // conditions for each case where the destination is != Dst using an OR and
8122     // negate it.
8123     DefaultMask = DefaultMask ? Builder.createOr(DefaultMask, Mask) : Mask;
8124   }
8125 
8126   if (DefaultMask) {
8127     DefaultMask = Builder.createNot(DefaultMask);
8128     if (SrcMask)
8129       DefaultMask = Builder.createLogicalAnd(SrcMask, DefaultMask);
8130   }
8131   EdgeMaskCache[{Src, DefaultDst}] = DefaultMask;
8132 }
8133 
8134 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
8135   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
8136 
8137   // Look for cached value.
8138   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
8139   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
8140   if (ECEntryIt != EdgeMaskCache.end())
8141     return ECEntryIt->second;
8142 
8143   if (auto *SI = dyn_cast<SwitchInst>(Src->getTerminator())) {
8144     createSwitchEdgeMasks(SI);
8145     assert(EdgeMaskCache.contains(Edge) && "Mask for Edge not created?");
8146     return EdgeMaskCache[Edge];
8147   }
8148 
8149   VPValue *SrcMask = getBlockInMask(Src);
8150 
8151   // The terminator has to be a branch inst!
8152   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
8153   assert(BI && "Unexpected terminator found");
8154   if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
8155     return EdgeMaskCache[Edge] = SrcMask;
8156 
8157   // If source is an exiting block, we know the exit edge is dynamically dead
8158   // in the vector loop, and thus we don't need to restrict the mask.  Avoid
8159   // adding uses of an otherwise potentially dead instruction unless we are
8160   // vectorizing a loop with uncountable exits. In that case, we always
8161   // materialize the mask.
8162   if (OrigLoop->isLoopExiting(Src) &&
8163       Src != Legal->getUncountableEarlyExitingBlock())
8164     return EdgeMaskCache[Edge] = SrcMask;
8165 
8166   VPValue *EdgeMask = getVPValueOrAddLiveIn(BI->getCondition());
8167   assert(EdgeMask && "No Edge Mask found for condition");
8168 
8169   if (BI->getSuccessor(0) != Dst)
8170     EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc());
8171 
8172   if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
8173     // The bitwise 'And' of SrcMask and EdgeMask introduces new UB if SrcMask
8174     // is false and EdgeMask is poison. Avoid that by using 'LogicalAnd'
8175     // instead which generates 'select i1 SrcMask, i1 EdgeMask, i1 false'.
8176     EdgeMask = Builder.createLogicalAnd(SrcMask, EdgeMask, BI->getDebugLoc());
8177   }
8178 
8179   return EdgeMaskCache[Edge] = EdgeMask;
8180 }
8181 
8182 VPValue *VPRecipeBuilder::getEdgeMask(BasicBlock *Src, BasicBlock *Dst) const {
8183   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
8184 
8185   // Look for cached value.
8186   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
8187   EdgeMaskCacheTy::const_iterator ECEntryIt = EdgeMaskCache.find(Edge);
8188   assert(ECEntryIt != EdgeMaskCache.end() &&
8189          "looking up mask for edge which has not been created");
8190   return ECEntryIt->second;
8191 }
8192 
8193 void VPRecipeBuilder::createHeaderMask() {
8194   BasicBlock *Header = OrigLoop->getHeader();
8195 
8196   // When not folding the tail, use nullptr to model all-true mask.
8197   if (!CM.foldTailByMasking()) {
8198     BlockMaskCache[Header] = nullptr;
8199     return;
8200   }
8201 
8202   // Introduce the early-exit compare IV <= BTC to form header block mask.
8203   // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
8204   // constructing the desired canonical IV in the header block as its first
8205   // non-phi instructions.
8206 
8207   VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
8208   auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
8209   auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV());
8210   HeaderVPBB->insert(IV, NewInsertionPoint);
8211 
8212   VPBuilder::InsertPointGuard Guard(Builder);
8213   Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
8214   VPValue *BlockMask = nullptr;
8215   VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
8216   BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC);
8217   BlockMaskCache[Header] = BlockMask;
8218 }
8219 
8220 VPValue *VPRecipeBuilder::getBlockInMask(BasicBlock *BB) const {
8221   // Return the cached value.
8222   BlockMaskCacheTy::const_iterator BCEntryIt = BlockMaskCache.find(BB);
8223   assert(BCEntryIt != BlockMaskCache.end() &&
8224          "Trying to access mask for block without one.");
8225   return BCEntryIt->second;
8226 }
8227 
8228 void VPRecipeBuilder::createBlockInMask(BasicBlock *BB) {
8229   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
8230   assert(BlockMaskCache.count(BB) == 0 && "Mask for block already computed");
8231   assert(OrigLoop->getHeader() != BB &&
8232          "Loop header must have cached block mask");
8233 
8234   // All-one mask is modelled as no-mask following the convention for masked
8235   // load/store/gather/scatter. Initialize BlockMask to no-mask.
8236   VPValue *BlockMask = nullptr;
8237   // This is the block mask. We OR all unique incoming edges.
8238   for (auto *Predecessor :
8239        SetVector<BasicBlock *>(pred_begin(BB), pred_end(BB))) {
8240     VPValue *EdgeMask = createEdgeMask(Predecessor, BB);
8241     if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is too.
8242       BlockMaskCache[BB] = EdgeMask;
8243       return;
8244     }
8245 
8246     if (!BlockMask) { // BlockMask has its initialized nullptr value.
8247       BlockMask = EdgeMask;
8248       continue;
8249     }
8250 
8251     BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
8252   }
8253 
8254   BlockMaskCache[BB] = BlockMask;
8255 }
8256 
8257 VPWidenMemoryRecipe *
8258 VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
8259                                   VFRange &Range) {
8260   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
8261          "Must be called with either a load or store");
8262 
8263   auto WillWiden = [&](ElementCount VF) -> bool {
8264     LoopVectorizationCostModel::InstWidening Decision =
8265         CM.getWideningDecision(I, VF);
8266     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
8267            "CM decision should be taken at this point.");
8268     if (Decision == LoopVectorizationCostModel::CM_Interleave)
8269       return true;
8270     if (CM.isScalarAfterVectorization(I, VF) ||
8271         CM.isProfitableToScalarize(I, VF))
8272       return false;
8273     return Decision != LoopVectorizationCostModel::CM_Scalarize;
8274   };
8275 
8276   if (!LoopVectorizationPlanner::getDecisionAndClampRange(WillWiden, Range))
8277     return nullptr;
8278 
8279   VPValue *Mask = nullptr;
8280   if (Legal->isMaskRequired(I))
8281     Mask = getBlockInMask(I->getParent());
8282 
8283   // Determine if the pointer operand of the access is either consecutive or
8284   // reverse consecutive.
8285   LoopVectorizationCostModel::InstWidening Decision =
8286       CM.getWideningDecision(I, Range.Start);
8287   bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
8288   bool Consecutive =
8289       Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
8290 
8291   VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
8292   if (Consecutive) {
8293     auto *GEP = dyn_cast<GetElementPtrInst>(
8294         Ptr->getUnderlyingValue()->stripPointerCasts());
8295     VPSingleDefRecipe *VectorPtr;
8296     if (Reverse) {
8297       // When folding the tail, we may compute an address that we don't in the
8298       // original scalar loop and it may not be inbounds. Drop Inbounds in that
8299       // case.
8300       GEPNoWrapFlags Flags =
8301           (CM.foldTailByMasking() || !GEP || !GEP->isInBounds())
8302               ? GEPNoWrapFlags::none()
8303               : GEPNoWrapFlags::inBounds();
8304       VectorPtr = new VPReverseVectorPointerRecipe(
8305           Ptr, &Plan.getVF(), getLoadStoreType(I), Flags, I->getDebugLoc());
8306     } else {
8307       VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I),
8308                                             GEP ? GEP->getNoWrapFlags()
8309                                                 : GEPNoWrapFlags::none(),
8310                                             I->getDebugLoc());
8311     }
8312     Builder.getInsertBlock()->appendRecipe(VectorPtr);
8313     Ptr = VectorPtr;
8314   }
8315   if (LoadInst *Load = dyn_cast<LoadInst>(I))
8316     return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
8317                                  I->getDebugLoc());
8318 
8319   StoreInst *Store = cast<StoreInst>(I);
8320   return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive,
8321                                 Reverse, I->getDebugLoc());
8322 }
8323 
8324 /// Creates a VPWidenIntOrFpInductionRecpipe for \p Phi. If needed, it will also
8325 /// insert a recipe to expand the step for the induction recipe.
8326 static VPWidenIntOrFpInductionRecipe *
8327 createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc,
8328                             VPValue *Start, const InductionDescriptor &IndDesc,
8329                             VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop) {
8330   assert(IndDesc.getStartValue() ==
8331          Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
8332   assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) &&
8333          "step must be loop invariant");
8334 
8335   VPValue *Step =
8336       vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE);
8337   if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) {
8338     return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(),
8339                                              IndDesc, TruncI,
8340                                              TruncI->getDebugLoc());
8341   }
8342   assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
8343   return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(),
8344                                            IndDesc, Phi->getDebugLoc());
8345 }
8346 
8347 VPHeaderPHIRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI(
8348     PHINode *Phi, ArrayRef<VPValue *> Operands, VFRange &Range) {
8349 
8350   // Check if this is an integer or fp induction. If so, build the recipe that
8351   // produces its scalar and vector values.
8352   if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
8353     return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, Plan,
8354                                        *PSE.getSE(), *OrigLoop);
8355 
8356   // Check if this is pointer induction. If so, build the recipe for it.
8357   if (auto *II = Legal->getPointerInductionDescriptor(Phi)) {
8358     VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep(),
8359                                                            *PSE.getSE());
8360     return new VPWidenPointerInductionRecipe(
8361         Phi, Operands[0], Step, *II,
8362         LoopVectorizationPlanner::getDecisionAndClampRange(
8363             [&](ElementCount VF) {
8364               return CM.isScalarAfterVectorization(Phi, VF);
8365             },
8366             Range),
8367         Phi->getDebugLoc());
8368   }
8369   return nullptr;
8370 }
8371 
8372 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
8373     TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range) {
8374   // Optimize the special case where the source is a constant integer
8375   // induction variable. Notice that we can only optimize the 'trunc' case
8376   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
8377   // (c) other casts depend on pointer size.
8378 
8379   // Determine whether \p K is a truncation based on an induction variable that
8380   // can be optimized.
8381   auto IsOptimizableIVTruncate =
8382       [&](Instruction *K) -> std::function<bool(ElementCount)> {
8383     return [=](ElementCount VF) -> bool {
8384       return CM.isOptimizableIVTruncate(K, VF);
8385     };
8386   };
8387 
8388   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8389           IsOptimizableIVTruncate(I), Range)) {
8390 
8391     auto *Phi = cast<PHINode>(I->getOperand(0));
8392     const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi);
8393     VPValue *Start = Plan.getOrAddLiveIn(II.getStartValue());
8394     return createWidenInductionRecipes(Phi, I, Start, II, Plan, *PSE.getSE(),
8395                                        *OrigLoop);
8396   }
8397   return nullptr;
8398 }
8399 
8400 VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi,
8401                                            ArrayRef<VPValue *> Operands) {
8402   unsigned NumIncoming = Phi->getNumIncomingValues();
8403 
8404   // We know that all PHIs in non-header blocks are converted into selects, so
8405   // we don't have to worry about the insertion order and we can just use the
8406   // builder. At this point we generate the predication tree. There may be
8407   // duplications since this is a simple recursive scan, but future
8408   // optimizations will clean it up.
8409   SmallVector<VPValue *, 2> OperandsWithMask;
8410 
8411   for (unsigned In = 0; In < NumIncoming; In++) {
8412     OperandsWithMask.push_back(Operands[In]);
8413     VPValue *EdgeMask =
8414         getEdgeMask(Phi->getIncomingBlock(In), Phi->getParent());
8415     if (!EdgeMask) {
8416       assert(In == 0 && "Both null and non-null edge masks found");
8417       assert(all_equal(Operands) &&
8418              "Distinct incoming values with one having a full mask");
8419       break;
8420     }
8421     OperandsWithMask.push_back(EdgeMask);
8422   }
8423   return new VPBlendRecipe(Phi, OperandsWithMask);
8424 }
8425 
8426 VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
8427                                                    ArrayRef<VPValue *> Operands,
8428                                                    VFRange &Range) {
8429   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8430       [this, CI](ElementCount VF) {
8431         return CM.isScalarWithPredication(CI, VF);
8432       },
8433       Range);
8434 
8435   if (IsPredicated)
8436     return nullptr;
8437 
8438   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8439   if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8440              ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8441              ID == Intrinsic::pseudoprobe ||
8442              ID == Intrinsic::experimental_noalias_scope_decl))
8443     return nullptr;
8444 
8445   SmallVector<VPValue *, 4> Ops(Operands.take_front(CI->arg_size()));
8446 
8447   // Is it beneficial to perform intrinsic call compared to lib call?
8448   bool ShouldUseVectorIntrinsic =
8449       ID && LoopVectorizationPlanner::getDecisionAndClampRange(
8450                 [&](ElementCount VF) -> bool {
8451                   return CM.getCallWideningDecision(CI, VF).Kind ==
8452                          LoopVectorizationCostModel::CM_IntrinsicCall;
8453                 },
8454                 Range);
8455   if (ShouldUseVectorIntrinsic)
8456     return new VPWidenIntrinsicRecipe(*CI, ID, Ops, CI->getType(),
8457                                       CI->getDebugLoc());
8458 
8459   Function *Variant = nullptr;
8460   std::optional<unsigned> MaskPos;
8461   // Is better to call a vectorized version of the function than to to scalarize
8462   // the call?
8463   auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange(
8464       [&](ElementCount VF) -> bool {
8465         // The following case may be scalarized depending on the VF.
8466         // The flag shows whether we can use a usual Call for vectorized
8467         // version of the instruction.
8468 
8469         // If we've found a variant at a previous VF, then stop looking. A
8470         // vectorized variant of a function expects input in a certain shape
8471         // -- basically the number of input registers, the number of lanes
8472         // per register, and whether there's a mask required.
8473         // We store a pointer to the variant in the VPWidenCallRecipe, so
8474         // once we have an appropriate variant it's only valid for that VF.
8475         // This will force a different vplan to be generated for each VF that
8476         // finds a valid variant.
8477         if (Variant)
8478           return false;
8479         LoopVectorizationCostModel::CallWideningDecision Decision =
8480             CM.getCallWideningDecision(CI, VF);
8481         if (Decision.Kind == LoopVectorizationCostModel::CM_VectorCall) {
8482           Variant = Decision.Variant;
8483           MaskPos = Decision.MaskPos;
8484           return true;
8485         }
8486 
8487         return false;
8488       },
8489       Range);
8490   if (ShouldUseVectorCall) {
8491     if (MaskPos.has_value()) {
8492       // We have 2 cases that would require a mask:
8493       //   1) The block needs to be predicated, either due to a conditional
8494       //      in the scalar loop or use of an active lane mask with
8495       //      tail-folding, and we use the appropriate mask for the block.
8496       //   2) No mask is required for the block, but the only available
8497       //      vector variant at this VF requires a mask, so we synthesize an
8498       //      all-true mask.
8499       VPValue *Mask = nullptr;
8500       if (Legal->isMaskRequired(CI))
8501         Mask = getBlockInMask(CI->getParent());
8502       else
8503         Mask = Plan.getOrAddLiveIn(
8504             ConstantInt::getTrue(IntegerType::getInt1Ty(CI->getContext())));
8505 
8506       Ops.insert(Ops.begin() + *MaskPos, Mask);
8507     }
8508 
8509     Ops.push_back(Operands.back());
8510     return new VPWidenCallRecipe(CI, Variant, Ops, CI->getDebugLoc());
8511   }
8512 
8513   return nullptr;
8514 }
8515 
8516 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
8517   assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8518          !isa<StoreInst>(I) && "Instruction should have been handled earlier");
8519   // Instruction should be widened, unless it is scalar after vectorization,
8520   // scalarization is profitable or it is predicated.
8521   auto WillScalarize = [this, I](ElementCount VF) -> bool {
8522     return CM.isScalarAfterVectorization(I, VF) ||
8523            CM.isProfitableToScalarize(I, VF) ||
8524            CM.isScalarWithPredication(I, VF);
8525   };
8526   return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
8527                                                              Range);
8528 }
8529 
8530 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
8531                                            ArrayRef<VPValue *> Operands,
8532                                            VPBasicBlock *VPBB) {
8533   switch (I->getOpcode()) {
8534   default:
8535     return nullptr;
8536   case Instruction::SDiv:
8537   case Instruction::UDiv:
8538   case Instruction::SRem:
8539   case Instruction::URem: {
8540     // If not provably safe, use a select to form a safe divisor before widening the
8541     // div/rem operation itself.  Otherwise fall through to general handling below.
8542     if (CM.isPredicatedInst(I)) {
8543       SmallVector<VPValue *> Ops(Operands);
8544       VPValue *Mask = getBlockInMask(I->getParent());
8545       VPValue *One =
8546           Plan.getOrAddLiveIn(ConstantInt::get(I->getType(), 1u, false));
8547       auto *SafeRHS = Builder.createSelect(Mask, Ops[1], One, I->getDebugLoc());
8548       Ops[1] = SafeRHS;
8549       return new VPWidenRecipe(*I, make_range(Ops.begin(), Ops.end()));
8550     }
8551     [[fallthrough]];
8552   }
8553   case Instruction::Add:
8554   case Instruction::And:
8555   case Instruction::AShr:
8556   case Instruction::FAdd:
8557   case Instruction::FCmp:
8558   case Instruction::FDiv:
8559   case Instruction::FMul:
8560   case Instruction::FNeg:
8561   case Instruction::FRem:
8562   case Instruction::FSub:
8563   case Instruction::ICmp:
8564   case Instruction::LShr:
8565   case Instruction::Mul:
8566   case Instruction::Or:
8567   case Instruction::Select:
8568   case Instruction::Shl:
8569   case Instruction::Sub:
8570   case Instruction::Xor:
8571   case Instruction::Freeze:
8572     SmallVector<VPValue *> NewOps(Operands);
8573     if (Instruction::isBinaryOp(I->getOpcode())) {
8574       // The legacy cost model uses SCEV to check if some of the operands are
8575       // constants. To match the legacy cost model's behavior, use SCEV to try
8576       // to replace operands with constants.
8577       ScalarEvolution &SE = *PSE.getSE();
8578       auto GetConstantViaSCEV = [this, &SE](VPValue *Op) {
8579         Value *V = Op->getUnderlyingValue();
8580         if (isa<Constant>(V) || !SE.isSCEVable(V->getType()))
8581           return Op;
8582         auto *C = dyn_cast<SCEVConstant>(SE.getSCEV(V));
8583         if (!C)
8584           return Op;
8585         return Plan.getOrAddLiveIn(C->getValue());
8586       };
8587       // For Mul, the legacy cost model checks both operands.
8588       if (I->getOpcode() == Instruction::Mul)
8589         NewOps[0] = GetConstantViaSCEV(NewOps[0]);
8590       // For other binops, the legacy cost model only checks the second operand.
8591       NewOps[1] = GetConstantViaSCEV(NewOps[1]);
8592     }
8593     return new VPWidenRecipe(*I, make_range(NewOps.begin(), NewOps.end()));
8594   };
8595 }
8596 
8597 VPHistogramRecipe *
8598 VPRecipeBuilder::tryToWidenHistogram(const HistogramInfo *HI,
8599                                      ArrayRef<VPValue *> Operands) {
8600   // FIXME: Support other operations.
8601   unsigned Opcode = HI->Update->getOpcode();
8602   assert((Opcode == Instruction::Add || Opcode == Instruction::Sub) &&
8603          "Histogram update operation must be an Add or Sub");
8604 
8605   SmallVector<VPValue *, 3> HGramOps;
8606   // Bucket address.
8607   HGramOps.push_back(Operands[1]);
8608   // Increment value.
8609   HGramOps.push_back(getVPValueOrAddLiveIn(HI->Update->getOperand(1)));
8610 
8611   // In case of predicated execution (due to tail-folding, or conditional
8612   // execution, or both), pass the relevant mask.
8613   if (Legal->isMaskRequired(HI->Store))
8614     HGramOps.push_back(getBlockInMask(HI->Store->getParent()));
8615 
8616   return new VPHistogramRecipe(Opcode,
8617                                make_range(HGramOps.begin(), HGramOps.end()),
8618                                HI->Store->getDebugLoc());
8619 }
8620 
8621 void VPRecipeBuilder::fixHeaderPhis() {
8622   BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
8623   for (VPHeaderPHIRecipe *R : PhisToFix) {
8624     auto *PN = cast<PHINode>(R->getUnderlyingValue());
8625     VPRecipeBase *IncR =
8626         getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
8627     R->addOperand(IncR->getVPSingleValue());
8628   }
8629 }
8630 
8631 VPReplicateRecipe *VPRecipeBuilder::handleReplication(Instruction *I,
8632                                                       VFRange &Range) {
8633   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
8634       [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8635       Range);
8636 
8637   bool IsPredicated = CM.isPredicatedInst(I);
8638 
8639   // Even if the instruction is not marked as uniform, there are certain
8640   // intrinsic calls that can be effectively treated as such, so we check for
8641   // them here. Conservatively, we only do this for scalable vectors, since
8642   // for fixed-width VFs we can always fall back on full scalarization.
8643   if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
8644     switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
8645     case Intrinsic::assume:
8646     case Intrinsic::lifetime_start:
8647     case Intrinsic::lifetime_end:
8648       // For scalable vectors if one of the operands is variant then we still
8649       // want to mark as uniform, which will generate one instruction for just
8650       // the first lane of the vector. We can't scalarize the call in the same
8651       // way as for fixed-width vectors because we don't know how many lanes
8652       // there are.
8653       //
8654       // The reasons for doing it this way for scalable vectors are:
8655       //   1. For the assume intrinsic generating the instruction for the first
8656       //      lane is still be better than not generating any at all. For
8657       //      example, the input may be a splat across all lanes.
8658       //   2. For the lifetime start/end intrinsics the pointer operand only
8659       //      does anything useful when the input comes from a stack object,
8660       //      which suggests it should always be uniform. For non-stack objects
8661       //      the effect is to poison the object, which still allows us to
8662       //      remove the call.
8663       IsUniform = true;
8664       break;
8665     default:
8666       break;
8667     }
8668   }
8669   VPValue *BlockInMask = nullptr;
8670   if (!IsPredicated) {
8671     // Finalize the recipe for Instr, first if it is not predicated.
8672     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8673   } else {
8674     LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8675     // Instructions marked for predication are replicated and a mask operand is
8676     // added initially. Masked replicate recipes will later be placed under an
8677     // if-then construct to prevent side-effects. Generate recipes to compute
8678     // the block mask for this region.
8679     BlockInMask = getBlockInMask(I->getParent());
8680   }
8681 
8682   // Note that there is some custom logic to mark some intrinsics as uniform
8683   // manually above for scalable vectors, which this assert needs to account for
8684   // as well.
8685   assert((Range.Start.isScalar() || !IsUniform || !IsPredicated ||
8686           (Range.Start.isScalable() && isa<IntrinsicInst>(I))) &&
8687          "Should not predicate a uniform recipe");
8688   auto *Recipe = new VPReplicateRecipe(I, mapToVPValues(I->operands()),
8689                                        IsUniform, BlockInMask);
8690   return Recipe;
8691 }
8692 
8693 /// Find all possible partial reductions in the loop and track all of those that
8694 /// are valid so recipes can be formed later.
8695 void VPRecipeBuilder::collectScaledReductions(VFRange &Range) {
8696   // Find all possible partial reductions.
8697   SmallVector<std::pair<PartialReductionChain, unsigned>>
8698       PartialReductionChains;
8699   for (const auto &[Phi, RdxDesc] : Legal->getReductionVars()) {
8700     getScaledReductions(Phi, RdxDesc.getLoopExitInstr(), Range,
8701                         PartialReductionChains);
8702   }
8703 
8704   // A partial reduction is invalid if any of its extends are used by
8705   // something that isn't another partial reduction. This is because the
8706   // extends are intended to be lowered along with the reduction itself.
8707 
8708   // Build up a set of partial reduction bin ops for efficient use checking.
8709   SmallSet<User *, 4> PartialReductionBinOps;
8710   for (const auto &[PartialRdx, _] : PartialReductionChains)
8711     PartialReductionBinOps.insert(PartialRdx.BinOp);
8712 
8713   auto ExtendIsOnlyUsedByPartialReductions =
8714       [&PartialReductionBinOps](Instruction *Extend) {
8715         return all_of(Extend->users(), [&](const User *U) {
8716           return PartialReductionBinOps.contains(U);
8717         });
8718       };
8719 
8720   // Check if each use of a chain's two extends is a partial reduction
8721   // and only add those that don't have non-partial reduction users.
8722   for (auto Pair : PartialReductionChains) {
8723     PartialReductionChain Chain = Pair.first;
8724     if (ExtendIsOnlyUsedByPartialReductions(Chain.ExtendA) &&
8725         ExtendIsOnlyUsedByPartialReductions(Chain.ExtendB))
8726       ScaledReductionMap.insert(std::make_pair(Chain.Reduction, Pair.second));
8727   }
8728 }
8729 
8730 bool VPRecipeBuilder::getScaledReductions(
8731     Instruction *PHI, Instruction *RdxExitInstr, VFRange &Range,
8732     SmallVectorImpl<std::pair<PartialReductionChain, unsigned>> &Chains) {
8733 
8734   if (!CM.TheLoop->contains(RdxExitInstr))
8735     return false;
8736 
8737   // TODO: Allow scaling reductions when predicating. The select at
8738   // the end of the loop chooses between the phi value and most recent
8739   // reduction result, both of which have different VFs to the active lane
8740   // mask when scaling.
8741   if (CM.blockNeedsPredicationForAnyReason(RdxExitInstr->getParent()))
8742     return false;
8743 
8744   auto *Update = dyn_cast<BinaryOperator>(RdxExitInstr);
8745   if (!Update)
8746     return false;
8747 
8748   Value *Op = Update->getOperand(0);
8749   Value *PhiOp = Update->getOperand(1);
8750   if (Op == PHI)
8751     std::swap(Op, PhiOp);
8752 
8753   // Try and get a scaled reduction from the first non-phi operand.
8754   // If one is found, we use the discovered reduction instruction in
8755   // place of the accumulator for costing.
8756   if (auto *OpInst = dyn_cast<Instruction>(Op)) {
8757     if (getScaledReductions(PHI, OpInst, Range, Chains)) {
8758       PHI = Chains.rbegin()->first.Reduction;
8759 
8760       Op = Update->getOperand(0);
8761       PhiOp = Update->getOperand(1);
8762       if (Op == PHI)
8763         std::swap(Op, PhiOp);
8764     }
8765   }
8766   if (PhiOp != PHI)
8767     return false;
8768 
8769   auto *BinOp = dyn_cast<BinaryOperator>(Op);
8770   if (!BinOp || !BinOp->hasOneUse())
8771     return false;
8772 
8773   using namespace llvm::PatternMatch;
8774   Value *A, *B;
8775   if (!match(BinOp->getOperand(0), m_ZExtOrSExt(m_Value(A))) ||
8776       !match(BinOp->getOperand(1), m_ZExtOrSExt(m_Value(B))))
8777     return false;
8778 
8779   Instruction *ExtA = cast<Instruction>(BinOp->getOperand(0));
8780   Instruction *ExtB = cast<Instruction>(BinOp->getOperand(1));
8781 
8782   TTI::PartialReductionExtendKind OpAExtend =
8783       TargetTransformInfo::getPartialReductionExtendKind(ExtA);
8784   TTI::PartialReductionExtendKind OpBExtend =
8785       TargetTransformInfo::getPartialReductionExtendKind(ExtB);
8786 
8787   PartialReductionChain Chain(RdxExitInstr, ExtA, ExtB, BinOp);
8788 
8789   unsigned TargetScaleFactor =
8790       PHI->getType()->getPrimitiveSizeInBits().getKnownScalarFactor(
8791           A->getType()->getPrimitiveSizeInBits());
8792 
8793   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8794           [&](ElementCount VF) {
8795             InstructionCost Cost = TTI->getPartialReductionCost(
8796                 Update->getOpcode(), A->getType(), B->getType(), PHI->getType(),
8797                 VF, OpAExtend, OpBExtend,
8798                 std::make_optional(BinOp->getOpcode()));
8799             return Cost.isValid();
8800           },
8801           Range)) {
8802     Chains.push_back(std::make_pair(Chain, TargetScaleFactor));
8803     return true;
8804   }
8805 
8806   return false;
8807 }
8808 
8809 VPRecipeBase *
8810 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8811                                         ArrayRef<VPValue *> Operands,
8812                                         VFRange &Range, VPBasicBlock *VPBB) {
8813   // First, check for specific widening recipes that deal with inductions, Phi
8814   // nodes, calls and memory operations.
8815   VPRecipeBase *Recipe;
8816   if (auto *Phi = dyn_cast<PHINode>(Instr)) {
8817     if (Phi->getParent() != OrigLoop->getHeader())
8818       return tryToBlend(Phi, Operands);
8819 
8820     if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range)))
8821       return Recipe;
8822 
8823     VPHeaderPHIRecipe *PhiRecipe = nullptr;
8824     assert((Legal->isReductionVariable(Phi) ||
8825             Legal->isFixedOrderRecurrence(Phi)) &&
8826            "can only widen reductions and fixed-order recurrences here");
8827     VPValue *StartV = Operands[0];
8828     if (Legal->isReductionVariable(Phi)) {
8829       const RecurrenceDescriptor &RdxDesc =
8830           Legal->getReductionVars().find(Phi)->second;
8831       assert(RdxDesc.getRecurrenceStartValue() ==
8832              Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8833 
8834       // If the PHI is used by a partial reduction, set the scale factor.
8835       unsigned ScaleFactor =
8836           getScalingForReduction(RdxDesc.getLoopExitInstr()).value_or(1);
8837       PhiRecipe = new VPReductionPHIRecipe(
8838           Phi, RdxDesc, *StartV, CM.isInLoopReduction(Phi),
8839           CM.useOrderedReductions(RdxDesc), ScaleFactor);
8840     } else {
8841       // TODO: Currently fixed-order recurrences are modeled as chains of
8842       // first-order recurrences. If there are no users of the intermediate
8843       // recurrences in the chain, the fixed order recurrence should be modeled
8844       // directly, enabling more efficient codegen.
8845       PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
8846     }
8847 
8848     PhisToFix.push_back(PhiRecipe);
8849     return PhiRecipe;
8850   }
8851 
8852   if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate(
8853                                     cast<TruncInst>(Instr), Operands, Range)))
8854     return Recipe;
8855 
8856   // All widen recipes below deal only with VF > 1.
8857   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8858           [&](ElementCount VF) { return VF.isScalar(); }, Range))
8859     return nullptr;
8860 
8861   if (auto *CI = dyn_cast<CallInst>(Instr))
8862     return tryToWidenCall(CI, Operands, Range);
8863 
8864   if (StoreInst *SI = dyn_cast<StoreInst>(Instr))
8865     if (auto HistInfo = Legal->getHistogramInfo(SI))
8866       return tryToWidenHistogram(*HistInfo, Operands);
8867 
8868   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8869     return tryToWidenMemory(Instr, Operands, Range);
8870 
8871   if (getScalingForReduction(Instr))
8872     return tryToCreatePartialReduction(Instr, Operands);
8873 
8874   if (!shouldWiden(Instr, Range))
8875     return nullptr;
8876 
8877   if (auto *GEP = dyn_cast<GetElementPtrInst>(Instr))
8878     return new VPWidenGEPRecipe(GEP,
8879                                 make_range(Operands.begin(), Operands.end()));
8880 
8881   if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8882     return new VPWidenSelectRecipe(
8883         *SI, make_range(Operands.begin(), Operands.end()));
8884   }
8885 
8886   if (auto *CI = dyn_cast<CastInst>(Instr)) {
8887     return new VPWidenCastRecipe(CI->getOpcode(), Operands[0], CI->getType(),
8888                                  *CI);
8889   }
8890 
8891   return tryToWiden(Instr, Operands, VPBB);
8892 }
8893 
8894 VPRecipeBase *
8895 VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
8896                                              ArrayRef<VPValue *> Operands) {
8897   assert(Operands.size() == 2 &&
8898          "Unexpected number of operands for partial reduction");
8899 
8900   VPValue *BinOp = Operands[0];
8901   VPValue *Accumulator = Operands[1];
8902   VPRecipeBase *BinOpRecipe = BinOp->getDefiningRecipe();
8903   if (isa<VPReductionPHIRecipe>(BinOpRecipe) ||
8904       isa<VPPartialReductionRecipe>(BinOpRecipe))
8905     std::swap(BinOp, Accumulator);
8906 
8907   return new VPPartialReductionRecipe(Reduction->getOpcode(), BinOp,
8908                                       Accumulator, Reduction);
8909 }
8910 
8911 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8912                                                         ElementCount MaxVF) {
8913   assert(OrigLoop->isInnermost() && "Inner loop expected.");
8914 
8915   auto MaxVFTimes2 = MaxVF * 2;
8916   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
8917     VFRange SubRange = {VF, MaxVFTimes2};
8918     if (auto Plan = tryToBuildVPlanWithVPRecipes(SubRange)) {
8919       // Now optimize the initial VPlan.
8920       if (!Plan->hasVF(ElementCount::getFixed(1)))
8921         VPlanTransforms::runPass(VPlanTransforms::truncateToMinimalBitwidths,
8922                                  *Plan, CM.getMinimalBitwidths());
8923       VPlanTransforms::optimize(*Plan);
8924       // TODO: try to put it close to addActiveLaneMask().
8925       // Discard the plan if it is not EVL-compatible
8926       if (CM.foldTailWithEVL() &&
8927           !VPlanTransforms::runPass(VPlanTransforms::tryAddExplicitVectorLength,
8928                                     *Plan, CM.getMaxSafeElements()))
8929         break;
8930       assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
8931       VPlans.push_back(std::move(Plan));
8932     }
8933     VF = SubRange.End;
8934   }
8935 }
8936 
8937 // Add the necessary canonical IV and branch recipes required to control the
8938 // loop.
8939 static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
8940                                   DebugLoc DL) {
8941   Value *StartIdx = ConstantInt::get(IdxTy, 0);
8942   auto *StartV = Plan.getOrAddLiveIn(StartIdx);
8943 
8944   // Add a VPCanonicalIVPHIRecipe starting at 0 to the header.
8945   auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
8946   VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
8947   VPBasicBlock *Header = TopRegion->getEntryBasicBlock();
8948   Header->insert(CanonicalIVPHI, Header->begin());
8949 
8950   VPBuilder Builder(TopRegion->getExitingBasicBlock());
8951   // Add a VPInstruction to increment the scalar canonical IV by VF * UF.
8952   auto *CanonicalIVIncrement = Builder.createOverflowingOp(
8953       Instruction::Add, {CanonicalIVPHI, &Plan.getVFxUF()}, {HasNUW, false}, DL,
8954       "index.next");
8955   CanonicalIVPHI->addOperand(CanonicalIVIncrement);
8956 
8957   // Add the BranchOnCount VPInstruction to the latch.
8958   Builder.createNaryOp(VPInstruction::BranchOnCount,
8959                        {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
8960 }
8961 
8962 /// Create and return a ResumePhi for \p WideIV, unless it is truncated. If the
8963 /// induction recipe is not canonical, creates a VPDerivedIVRecipe to compute
8964 /// the end value of the induction.
8965 static VPInstruction *addResumePhiRecipeForInduction(
8966     VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder,
8967     VPBuilder &ScalarPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC) {
8968   auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
8969   // Truncated wide inductions resume from the last lane of their vector value
8970   // in the last vector iteration which is handled elsewhere.
8971   if (WideIntOrFp && WideIntOrFp->getTruncInst())
8972     return nullptr;
8973 
8974   VPValue *Start = WideIV->getStartValue();
8975   VPValue *Step = WideIV->getStepValue();
8976   const InductionDescriptor &ID = WideIV->getInductionDescriptor();
8977   VPValue *EndValue = VectorTC;
8978   if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
8979     EndValue = VectorPHBuilder.createDerivedIV(
8980         ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
8981         Start, VectorTC, Step);
8982   }
8983 
8984   // EndValue is derived from the vector trip count (which has the same type as
8985   // the widest induction) and thus may be wider than the induction here.
8986   Type *ScalarTypeOfWideIV = TypeInfo.inferScalarType(WideIV);
8987   if (ScalarTypeOfWideIV != TypeInfo.inferScalarType(EndValue)) {
8988     EndValue = VectorPHBuilder.createScalarCast(Instruction::Trunc, EndValue,
8989                                                 ScalarTypeOfWideIV,
8990                                                 WideIV->getDebugLoc());
8991   }
8992 
8993   auto *ResumePhiRecipe =
8994       ScalarPHBuilder.createNaryOp(VPInstruction::ResumePhi, {EndValue, Start},
8995                                    WideIV->getDebugLoc(), "bc.resume.val");
8996   return ResumePhiRecipe;
8997 }
8998 
8999 /// Create resume phis in the scalar preheader for first-order recurrences,
9000 /// reductions and inductions, and update the VPIRInstructions wrapping the
9001 /// original phis in the scalar header. End values for inductions are added to
9002 /// \p IVEndValues.
9003 static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan,
9004                                 DenseMap<VPValue *, VPValue *> &IVEndValues) {
9005   VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
9006   auto *ScalarPH = Plan.getScalarPreheader();
9007   auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getSinglePredecessor());
9008   VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
9009   VPBuilder VectorPHBuilder(
9010       cast<VPBasicBlock>(VectorRegion->getSinglePredecessor()));
9011   VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
9012   VPBuilder ScalarPHBuilder(ScalarPH);
9013   VPValue *OneVPV = Plan.getOrAddLiveIn(
9014       ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1));
9015   for (VPRecipeBase &ScalarPhiR : *Plan.getScalarHeader()) {
9016     auto *ScalarPhiIRI = cast<VPIRInstruction>(&ScalarPhiR);
9017     auto *ScalarPhiI = dyn_cast<PHINode>(&ScalarPhiIRI->getInstruction());
9018     if (!ScalarPhiI)
9019       break;
9020 
9021     // TODO: Extract final value from induction recipe initially, optimize to
9022     // pre-computed end value together in optimizeInductionExitUsers.
9023     auto *VectorPhiR = cast<VPHeaderPHIRecipe>(Builder.getRecipe(ScalarPhiI));
9024     if (auto *WideIVR = dyn_cast<VPWidenInductionRecipe>(VectorPhiR)) {
9025       if (VPInstruction *ResumePhi = addResumePhiRecipeForInduction(
9026               WideIVR, VectorPHBuilder, ScalarPHBuilder, TypeInfo,
9027               &Plan.getVectorTripCount())) {
9028         assert(ResumePhi->getOpcode() == VPInstruction::ResumePhi &&
9029                "Expected a ResumePhi");
9030         IVEndValues[WideIVR] = ResumePhi->getOperand(0);
9031         ScalarPhiIRI->addOperand(ResumePhi);
9032         continue;
9033       }
9034       // TODO: Also handle truncated inductions here. Computing end-values
9035       // separately should be done as VPlan-to-VPlan optimization, after
9036       // legalizing all resume values to use the last lane from the loop.
9037       assert(cast<VPWidenIntOrFpInductionRecipe>(VectorPhiR)->getTruncInst() &&
9038              "should only skip truncated wide inductions");
9039       continue;
9040     }
9041 
9042     // The backedge value provides the value to resume coming out of a loop,
9043     // which for FORs is a vector whose last element needs to be extracted. The
9044     // start value provides the value if the loop is bypassed.
9045     bool IsFOR = isa<VPFirstOrderRecurrencePHIRecipe>(VectorPhiR);
9046     auto *ResumeFromVectorLoop = VectorPhiR->getBackedgeValue();
9047     assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
9048            "Cannot handle loops with uncountable early exits");
9049     if (IsFOR)
9050       ResumeFromVectorLoop = MiddleBuilder.createNaryOp(
9051           VPInstruction::ExtractFromEnd, {ResumeFromVectorLoop, OneVPV}, {},
9052           "vector.recur.extract");
9053     StringRef Name = IsFOR ? "scalar.recur.init" : "bc.merge.rdx";
9054     auto *ResumePhiR = ScalarPHBuilder.createNaryOp(
9055         VPInstruction::ResumePhi,
9056         {ResumeFromVectorLoop, VectorPhiR->getStartValue()}, {}, Name);
9057     ScalarPhiIRI->addOperand(ResumePhiR);
9058   }
9059 }
9060 
9061 // Collect VPIRInstructions for phis in the exit blocks that are modeled
9062 // in VPlan and add the exiting VPValue as operand.
9063 static SetVector<VPIRInstruction *>
9064 collectUsersInExitBlocks(Loop *OrigLoop, VPRecipeBuilder &Builder,
9065                          VPlan &Plan) {
9066   SetVector<VPIRInstruction *> ExitUsersToFix;
9067   for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
9068     for (VPRecipeBase &R : *ExitVPBB) {
9069       auto *ExitIRI = dyn_cast<VPIRInstruction>(&R);
9070       if (!ExitIRI)
9071         continue;
9072       auto *ExitPhi = dyn_cast<PHINode>(&ExitIRI->getInstruction());
9073       if (!ExitPhi)
9074         break;
9075       if (ExitVPBB->getSinglePredecessor() != Plan.getMiddleBlock()) {
9076         assert(ExitIRI->getNumOperands() ==
9077                    ExitVPBB->getPredecessors().size() &&
9078                "early-exit must update exit values on construction");
9079         continue;
9080       }
9081       BasicBlock *ExitingBB = OrigLoop->getLoopLatch();
9082       Value *IncomingValue = ExitPhi->getIncomingValueForBlock(ExitingBB);
9083       VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue);
9084       ExitIRI->addOperand(V);
9085       if (V->isLiveIn())
9086         continue;
9087       assert(V->getDefiningRecipe()->getParent()->getEnclosingLoopRegion() &&
9088              "Only recipes defined inside a region should need fixing.");
9089       ExitUsersToFix.insert(ExitIRI);
9090     }
9091   }
9092   return ExitUsersToFix;
9093 }
9094 
9095 // Add exit values to \p Plan. Extracts are added for each entry in \p
9096 // ExitUsersToFix if needed and their operands are updated.
9097 static void
9098 addUsersInExitBlocks(VPlan &Plan,
9099                      const SetVector<VPIRInstruction *> &ExitUsersToFix) {
9100   if (ExitUsersToFix.empty())
9101     return;
9102 
9103   auto *MiddleVPBB = Plan.getMiddleBlock();
9104   VPBuilder B(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
9105 
9106   // Introduce extract for exiting values and update the VPIRInstructions
9107   // modeling the corresponding LCSSA phis.
9108   for (VPIRInstruction *ExitIRI : ExitUsersToFix) {
9109     assert(ExitIRI->getNumOperands() == 1 &&
9110            ExitIRI->getParent()->getSinglePredecessor() == MiddleVPBB &&
9111            "exit values from early exits must be fixed when branch to "
9112            "early-exit is added");
9113     ExitIRI->extractLastLaneOfOperand(B);
9114   }
9115 }
9116 
9117 /// Handle users in the exit block for first order reductions in the original
9118 /// exit block. The penultimate value of recurrences is fed to their LCSSA phi
9119 /// users in the original exit block using the VPIRInstruction wrapping to the
9120 /// LCSSA phi.
9121 static void addExitUsersForFirstOrderRecurrences(
9122     VPlan &Plan, SetVector<VPIRInstruction *> &ExitUsersToFix) {
9123   VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
9124   auto *ScalarPHVPBB = Plan.getScalarPreheader();
9125   auto *MiddleVPBB = Plan.getMiddleBlock();
9126   VPBuilder ScalarPHBuilder(ScalarPHVPBB);
9127   VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
9128   VPValue *TwoVPV = Plan.getOrAddLiveIn(
9129       ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 2));
9130 
9131   for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
9132     auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
9133     if (!FOR)
9134       continue;
9135 
9136     assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
9137            "Cannot handle loops with uncountable early exits");
9138 
9139     // This is the second phase of vectorizing first-order recurrences, creating
9140     // extract for users outside the loop. An overview of the transformation is
9141     // described below. Suppose we have the following loop with some use after
9142     // the loop of the last a[i-1],
9143     //
9144     //   for (int i = 0; i < n; ++i) {
9145     //     t = a[i - 1];
9146     //     b[i] = a[i] - t;
9147     //   }
9148     //   use t;
9149     //
9150     // There is a first-order recurrence on "a". For this loop, the shorthand
9151     // scalar IR looks like:
9152     //
9153     //   scalar.ph:
9154     //     s.init = a[-1]
9155     //     br scalar.body
9156     //
9157     //   scalar.body:
9158     //     i = phi [0, scalar.ph], [i+1, scalar.body]
9159     //     s1 = phi [s.init, scalar.ph], [s2, scalar.body]
9160     //     s2 = a[i]
9161     //     b[i] = s2 - s1
9162     //     br cond, scalar.body, exit.block
9163     //
9164     //   exit.block:
9165     //     use = lcssa.phi [s1, scalar.body]
9166     //
9167     // In this example, s1 is a recurrence because it's value depends on the
9168     // previous iteration. In the first phase of vectorization, we created a
9169     // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts
9170     // for users in the scalar preheader and exit block.
9171     //
9172     //   vector.ph:
9173     //     v_init = vector(..., ..., ..., a[-1])
9174     //     br vector.body
9175     //
9176     //   vector.body
9177     //     i = phi [0, vector.ph], [i+4, vector.body]
9178     //     v1 = phi [v_init, vector.ph], [v2, vector.body]
9179     //     v2 = a[i, i+1, i+2, i+3]
9180     //     b[i] = v2 - v1
9181     //     // Next, third phase will introduce v1' = splice(v1(3), v2(0, 1, 2))
9182     //     b[i, i+1, i+2, i+3] = v2 - v1
9183     //     br cond, vector.body, middle.block
9184     //
9185     //   middle.block:
9186     //     vector.recur.extract.for.phi = v2(2)
9187     //     vector.recur.extract = v2(3)
9188     //     br cond, scalar.ph, exit.block
9189     //
9190     //   scalar.ph:
9191     //     scalar.recur.init = phi [vector.recur.extract, middle.block],
9192     //                             [s.init, otherwise]
9193     //     br scalar.body
9194     //
9195     //   scalar.body:
9196     //     i = phi [0, scalar.ph], [i+1, scalar.body]
9197     //     s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body]
9198     //     s2 = a[i]
9199     //     b[i] = s2 - s1
9200     //     br cond, scalar.body, exit.block
9201     //
9202     //   exit.block:
9203     //     lo = lcssa.phi [s1, scalar.body],
9204     //                    [vector.recur.extract.for.phi, middle.block]
9205     //
9206     // Now update VPIRInstructions modeling LCSSA phis in the exit block.
9207     // Extract the penultimate value of the recurrence and use it as operand for
9208     // the VPIRInstruction modeling the phi.
9209     for (VPIRInstruction *ExitIRI : ExitUsersToFix) {
9210       if (ExitIRI->getOperand(0) != FOR)
9211         continue;
9212       VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
9213           VPInstruction::ExtractFromEnd, {FOR->getBackedgeValue(), TwoVPV}, {},
9214           "vector.recur.extract.for.phi");
9215       ExitIRI->setOperand(0, PenultimateElement);
9216       ExitUsersToFix.remove(ExitIRI);
9217     }
9218   }
9219 }
9220 
9221 VPlanPtr
9222 LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
9223 
9224   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
9225 
9226   // ---------------------------------------------------------------------------
9227   // Build initial VPlan: Scan the body of the loop in a topological order to
9228   // visit each basic block after having visited its predecessor basic blocks.
9229   // ---------------------------------------------------------------------------
9230 
9231   // Create initial VPlan skeleton, having a basic block for the pre-header
9232   // which contains SCEV expansions that need to happen before the CFG is
9233   // modified; a basic block for the vector pre-header, followed by a region for
9234   // the vector loop, followed by the middle basic block. The skeleton vector
9235   // loop region contains a header and latch basic blocks.
9236 
9237   bool RequiresScalarEpilogueCheck =
9238       LoopVectorizationPlanner::getDecisionAndClampRange(
9239           [this](ElementCount VF) {
9240             return !CM.requiresScalarEpilogue(VF.isVector());
9241           },
9242           Range);
9243   VPlanPtr Plan = VPlan::createInitialVPlan(Legal->getWidestInductionType(),
9244                                             PSE, RequiresScalarEpilogueCheck,
9245                                             CM.foldTailByMasking(), OrigLoop);
9246 
9247   // Don't use getDecisionAndClampRange here, because we don't know the UF
9248   // so this function is better to be conservative, rather than to split
9249   // it up into different VPlans.
9250   // TODO: Consider using getDecisionAndClampRange here to split up VPlans.
9251   bool IVUpdateMayOverflow = false;
9252   for (ElementCount VF : Range)
9253     IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF);
9254 
9255   DebugLoc DL = getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
9256   TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow);
9257   // Use NUW for the induction increment if we proved that it won't overflow in
9258   // the vector loop or when not folding the tail. In the later case, we know
9259   // that the canonical induction increment will not overflow as the vector trip
9260   // count is >= increment and a multiple of the increment.
9261   bool HasNUW = !IVUpdateMayOverflow || Style == TailFoldingStyle::None;
9262   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL);
9263 
9264   VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE,
9265                                 Builder);
9266 
9267   // ---------------------------------------------------------------------------
9268   // Pre-construction: record ingredients whose recipes we'll need to further
9269   // process after constructing the initial VPlan.
9270   // ---------------------------------------------------------------------------
9271 
9272   // For each interleave group which is relevant for this (possibly trimmed)
9273   // Range, add it to the set of groups to be later applied to the VPlan and add
9274   // placeholders for its members' Recipes which we'll be replacing with a
9275   // single VPInterleaveRecipe.
9276   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
9277     auto ApplyIG = [IG, this](ElementCount VF) -> bool {
9278       bool Result = (VF.isVector() && // Query is illegal for VF == 1
9279                      CM.getWideningDecision(IG->getInsertPos(), VF) ==
9280                          LoopVectorizationCostModel::CM_Interleave);
9281       // For scalable vectors, the only interleave factor currently supported
9282       // is 2 since we require the (de)interleave2 intrinsics instead of
9283       // shufflevectors.
9284       assert((!Result || !VF.isScalable() || IG->getFactor() == 2) &&
9285              "Unsupported interleave factor for scalable vectors");
9286       return Result;
9287     };
9288     if (!getDecisionAndClampRange(ApplyIG, Range))
9289       continue;
9290     InterleaveGroups.insert(IG);
9291   }
9292 
9293   // ---------------------------------------------------------------------------
9294   // Construct recipes for the instructions in the loop
9295   // ---------------------------------------------------------------------------
9296 
9297   // Scan the body of the loop in a topological order to visit each basic block
9298   // after having visited its predecessor basic blocks.
9299   LoopBlocksDFS DFS(OrigLoop);
9300   DFS.perform(LI);
9301 
9302   VPBasicBlock *HeaderVPBB = Plan->getVectorLoopRegion()->getEntryBasicBlock();
9303   VPBasicBlock *VPBB = HeaderVPBB;
9304   BasicBlock *HeaderBB = OrigLoop->getHeader();
9305   bool NeedsMasks =
9306       CM.foldTailByMasking() ||
9307       any_of(OrigLoop->blocks(), [this, HeaderBB](BasicBlock *BB) {
9308         bool NeedsBlends = BB != HeaderBB && !BB->phis().empty();
9309         return Legal->blockNeedsPredication(BB) || NeedsBlends;
9310       });
9311 
9312   RecipeBuilder.collectScaledReductions(Range);
9313 
9314   auto *MiddleVPBB = Plan->getMiddleBlock();
9315   VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi();
9316   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
9317     // Relevant instructions from basic block BB will be grouped into VPRecipe
9318     // ingredients and fill a new VPBasicBlock.
9319     if (VPBB != HeaderVPBB)
9320       VPBB->setName(BB->getName());
9321     Builder.setInsertPoint(VPBB);
9322 
9323     if (VPBB == HeaderVPBB)
9324       RecipeBuilder.createHeaderMask();
9325     else if (NeedsMasks)
9326       RecipeBuilder.createBlockInMask(BB);
9327 
9328     // Introduce each ingredient into VPlan.
9329     // TODO: Model and preserve debug intrinsics in VPlan.
9330     for (Instruction &I : drop_end(BB->instructionsWithoutDebug(false))) {
9331       Instruction *Instr = &I;
9332       SmallVector<VPValue *, 4> Operands;
9333       auto *Phi = dyn_cast<PHINode>(Instr);
9334       if (Phi && Phi->getParent() == HeaderBB) {
9335         Operands.push_back(Plan->getOrAddLiveIn(
9336             Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
9337       } else {
9338         auto OpRange = RecipeBuilder.mapToVPValues(Instr->operands());
9339         Operands = {OpRange.begin(), OpRange.end()};
9340       }
9341 
9342       // The stores with invariant address inside the loop will be deleted, and
9343       // in the exit block, a uniform store recipe will be created for the final
9344       // invariant store of the reduction.
9345       StoreInst *SI;
9346       if ((SI = dyn_cast<StoreInst>(&I)) &&
9347           Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) {
9348         // Only create recipe for the final invariant store of the reduction.
9349         if (!Legal->isInvariantStoreOfReduction(SI))
9350           continue;
9351         auto *Recipe = new VPReplicateRecipe(
9352             SI, RecipeBuilder.mapToVPValues(Instr->operands()),
9353             true /* IsUniform */);
9354         Recipe->insertBefore(*MiddleVPBB, MBIP);
9355         continue;
9356       }
9357 
9358       VPRecipeBase *Recipe =
9359           RecipeBuilder.tryToCreateWidenRecipe(Instr, Operands, Range, VPBB);
9360       if (!Recipe)
9361         Recipe = RecipeBuilder.handleReplication(Instr, Range);
9362 
9363       RecipeBuilder.setRecipe(Instr, Recipe);
9364       if (isa<VPHeaderPHIRecipe>(Recipe)) {
9365         // VPHeaderPHIRecipes must be kept in the phi section of HeaderVPBB. In
9366         // the following cases, VPHeaderPHIRecipes may be created after non-phi
9367         // recipes and need to be moved to the phi section of HeaderVPBB:
9368         // * tail-folding (non-phi recipes computing the header mask are
9369         // introduced earlier than regular header phi recipes, and should appear
9370         // after them)
9371         // * Optimizing truncates to VPWidenIntOrFpInductionRecipe.
9372 
9373         assert((HeaderVPBB->getFirstNonPhi() == VPBB->end() ||
9374                 CM.foldTailByMasking() || isa<TruncInst>(Instr)) &&
9375                "unexpected recipe needs moving");
9376         Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
9377       } else
9378         VPBB->appendRecipe(Recipe);
9379     }
9380 
9381     VPBlockUtils::insertBlockAfter(Plan->createVPBasicBlock(""), VPBB);
9382     VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
9383   }
9384 
9385   // After here, VPBB should not be used.
9386   VPBB = nullptr;
9387 
9388   assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
9389          !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
9390          "entry block must be set to a VPRegionBlock having a non-empty entry "
9391          "VPBasicBlock");
9392   RecipeBuilder.fixHeaderPhis();
9393 
9394   // Update wide induction increments to use the same step as the corresponding
9395   // wide induction. This enables detecting induction increments directly in
9396   // VPlan and removes redundant splats.
9397   for (const auto &[Phi, ID] : Legal->getInductionVars()) {
9398     auto *IVInc = cast<Instruction>(
9399         Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
9400     if (IVInc->getOperand(0) != Phi || IVInc->getOpcode() != Instruction::Add)
9401       continue;
9402     VPWidenInductionRecipe *WideIV =
9403         cast<VPWidenInductionRecipe>(RecipeBuilder.getRecipe(Phi));
9404     VPRecipeBase *R = RecipeBuilder.getRecipe(IVInc);
9405     R->setOperand(1, WideIV->getStepValue());
9406   }
9407 
9408   if (auto *UncountableExitingBlock =
9409           Legal->getUncountableEarlyExitingBlock()) {
9410     if (!VPlanTransforms::handleUncountableEarlyExit(
9411             *Plan, *PSE.getSE(), OrigLoop, UncountableExitingBlock,
9412             RecipeBuilder)) {
9413       reportVectorizationFailure(
9414           "Some exit values in loop with uncountable exit not supported yet",
9415           "UncountableEarlyExitLoopsUnsupportedExitValue", ORE, OrigLoop);
9416       return nullptr;
9417     }
9418   }
9419   DenseMap<VPValue *, VPValue *> IVEndValues;
9420   addScalarResumePhis(RecipeBuilder, *Plan, IVEndValues);
9421   SetVector<VPIRInstruction *> ExitUsersToFix =
9422       collectUsersInExitBlocks(OrigLoop, RecipeBuilder, *Plan);
9423   addExitUsersForFirstOrderRecurrences(*Plan, ExitUsersToFix);
9424   addUsersInExitBlocks(*Plan, ExitUsersToFix);
9425 
9426   // ---------------------------------------------------------------------------
9427   // Transform initial VPlan: Apply previously taken decisions, in order, to
9428   // bring the VPlan to its final state.
9429   // ---------------------------------------------------------------------------
9430 
9431   // Adjust the recipes for any inloop reductions.
9432   adjustRecipesForReductions(Plan, RecipeBuilder, Range.Start);
9433 
9434   // Interleave memory: for each Interleave Group we marked earlier as relevant
9435   // for this VPlan, replace the Recipes widening its memory instructions with a
9436   // single VPInterleaveRecipe at its insertion point.
9437   VPlanTransforms::runPass(VPlanTransforms::createInterleaveGroups, *Plan,
9438                            InterleaveGroups, RecipeBuilder,
9439                            CM.isScalarEpilogueAllowed());
9440 
9441   for (ElementCount VF : Range)
9442     Plan->addVF(VF);
9443   Plan->setName("Initial VPlan");
9444 
9445   // Replace VPValues for known constant strides guaranteed by predicate scalar
9446   // evolution.
9447   auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) {
9448     auto *R = cast<VPRecipeBase>(&U);
9449     return R->getParent()->getParent() ||
9450            R->getParent() ==
9451                Plan->getVectorLoopRegion()->getSinglePredecessor();
9452   };
9453   for (auto [_, Stride] : Legal->getLAI()->getSymbolicStrides()) {
9454     auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
9455     auto *ScevStride = dyn_cast<SCEVConstant>(PSE.getSCEV(StrideV));
9456     // Only handle constant strides for now.
9457     if (!ScevStride)
9458       continue;
9459 
9460     auto *CI = Plan->getOrAddLiveIn(
9461         ConstantInt::get(Stride->getType(), ScevStride->getAPInt()));
9462     if (VPValue *StrideVPV = Plan->getLiveIn(StrideV))
9463       StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
9464 
9465     // The versioned value may not be used in the loop directly but through a
9466     // sext/zext. Add new live-ins in those cases.
9467     for (Value *U : StrideV->users()) {
9468       if (!isa<SExtInst, ZExtInst>(U))
9469         continue;
9470       VPValue *StrideVPV = Plan->getLiveIn(U);
9471       if (!StrideVPV)
9472         continue;
9473       unsigned BW = U->getType()->getScalarSizeInBits();
9474       APInt C = isa<SExtInst>(U) ? ScevStride->getAPInt().sext(BW)
9475                                  : ScevStride->getAPInt().zext(BW);
9476       VPValue *CI = Plan->getOrAddLiveIn(ConstantInt::get(U->getType(), C));
9477       StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
9478     }
9479   }
9480 
9481   auto BlockNeedsPredication = [this](BasicBlock *BB) {
9482     return Legal->blockNeedsPredication(BB);
9483   };
9484   VPlanTransforms::runPass(VPlanTransforms::dropPoisonGeneratingRecipes, *Plan,
9485                            BlockNeedsPredication);
9486 
9487   // Sink users of fixed-order recurrence past the recipe defining the previous
9488   // value and introduce FirstOrderRecurrenceSplice VPInstructions.
9489   if (!VPlanTransforms::runPass(VPlanTransforms::adjustFixedOrderRecurrences,
9490                                 *Plan, Builder))
9491     return nullptr;
9492 
9493   if (useActiveLaneMask(Style)) {
9494     // TODO: Move checks to VPlanTransforms::addActiveLaneMask once
9495     // TailFoldingStyle is visible there.
9496     bool ForControlFlow = useActiveLaneMaskForControlFlow(Style);
9497     bool WithoutRuntimeCheck =
9498         Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
9499     VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow,
9500                                        WithoutRuntimeCheck);
9501   }
9502   VPlanTransforms::optimizeInductionExitUsers(*Plan, IVEndValues);
9503 
9504   assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
9505   return Plan;
9506 }
9507 
9508 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
9509   // Outer loop handling: They may require CFG and instruction level
9510   // transformations before even evaluating whether vectorization is profitable.
9511   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
9512   // the vectorization pipeline.
9513   assert(!OrigLoop->isInnermost());
9514   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
9515 
9516   // Create new empty VPlan
9517   auto Plan = VPlan::createInitialVPlan(Legal->getWidestInductionType(), PSE,
9518                                         true, false, OrigLoop);
9519 
9520   // Build hierarchical CFG
9521   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
9522   HCFGBuilder.buildHierarchicalCFG();
9523 
9524   for (ElementCount VF : Range)
9525     Plan->addVF(VF);
9526 
9527   VPlanTransforms::VPInstructionsToVPRecipes(
9528       Plan,
9529       [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
9530       *PSE.getSE(), *TLI);
9531 
9532   // Tail folding is not supported for outer loops, so the induction increment
9533   // is guaranteed to not wrap.
9534   bool HasNUW = true;
9535   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW,
9536                         DebugLoc());
9537 
9538   // Collect mapping of IR header phis to header phi recipes, to be used in
9539   // addScalarResumePhis.
9540   VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE,
9541                                 Builder);
9542   for (auto &R : Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
9543     if (isa<VPCanonicalIVPHIRecipe>(&R))
9544       continue;
9545     auto *HeaderR = cast<VPHeaderPHIRecipe>(&R);
9546     RecipeBuilder.setRecipe(HeaderR->getUnderlyingInstr(), HeaderR);
9547   }
9548   DenseMap<VPValue *, VPValue *> IVEndValues;
9549   // TODO: IVEndValues are not used yet in the native path, to optimize exit
9550   // values.
9551   addScalarResumePhis(RecipeBuilder, *Plan, IVEndValues);
9552 
9553   assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
9554   return Plan;
9555 }
9556 
9557 // Adjust the recipes for reductions. For in-loop reductions the chain of
9558 // instructions leading from the loop exit instr to the phi need to be converted
9559 // to reductions, with one operand being vector and the other being the scalar
9560 // reduction chain. For other reductions, a select is introduced between the phi
9561 // and users outside the vector region when folding the tail.
9562 //
9563 // A ComputeReductionResult recipe is added to the middle block, also for
9564 // in-loop reductions which compute their result in-loop, because generating
9565 // the subsequent bc.merge.rdx phi is driven by ComputeReductionResult recipes.
9566 //
9567 // Adjust AnyOf reductions; replace the reduction phi for the selected value
9568 // with a boolean reduction phi node to check if the condition is true in any
9569 // iteration. The final value is selected by the final ComputeReductionResult.
9570 void LoopVectorizationPlanner::adjustRecipesForReductions(
9571     VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) {
9572   using namespace VPlanPatternMatch;
9573   VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion();
9574   VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock();
9575   VPBasicBlock *MiddleVPBB = Plan->getMiddleBlock();
9576   SmallVector<VPRecipeBase *> ToDelete;
9577 
9578   for (VPRecipeBase &R : Header->phis()) {
9579     auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9580     if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered()))
9581       continue;
9582 
9583     const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
9584     RecurKind Kind = RdxDesc.getRecurrenceKind();
9585     assert(
9586         !RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) &&
9587         !RecurrenceDescriptor::isFindLastIVRecurrenceKind(Kind) &&
9588         "AnyOf and FindLast reductions are not allowed for in-loop reductions");
9589 
9590     // Collect the chain of "link" recipes for the reduction starting at PhiR.
9591     SetVector<VPSingleDefRecipe *> Worklist;
9592     Worklist.insert(PhiR);
9593     for (unsigned I = 0; I != Worklist.size(); ++I) {
9594       VPSingleDefRecipe *Cur = Worklist[I];
9595       for (VPUser *U : Cur->users()) {
9596         auto *UserRecipe = cast<VPSingleDefRecipe>(U);
9597         if (!UserRecipe->getParent()->getEnclosingLoopRegion()) {
9598           assert((UserRecipe->getParent() == MiddleVPBB ||
9599                   UserRecipe->getParent() == Plan->getScalarPreheader()) &&
9600                  "U must be either in the loop region, the middle block or the "
9601                  "scalar preheader.");
9602           continue;
9603         }
9604         Worklist.insert(UserRecipe);
9605       }
9606     }
9607 
9608     // Visit operation "Links" along the reduction chain top-down starting from
9609     // the phi until LoopExitValue. We keep track of the previous item
9610     // (PreviousLink) to tell which of the two operands of a Link will remain
9611     // scalar and which will be reduced. For minmax by select(cmp), Link will be
9612     // the select instructions. Blend recipes of in-loop reduction phi's  will
9613     // get folded to their non-phi operand, as the reduction recipe handles the
9614     // condition directly.
9615     VPSingleDefRecipe *PreviousLink = PhiR; // Aka Worklist[0].
9616     for (VPSingleDefRecipe *CurrentLink : Worklist.getArrayRef().drop_front()) {
9617       Instruction *CurrentLinkI = CurrentLink->getUnderlyingInstr();
9618 
9619       // Index of the first operand which holds a non-mask vector operand.
9620       unsigned IndexOfFirstOperand;
9621       // Recognize a call to the llvm.fmuladd intrinsic.
9622       bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
9623       VPValue *VecOp;
9624       VPBasicBlock *LinkVPBB = CurrentLink->getParent();
9625       if (IsFMulAdd) {
9626         assert(
9627             RecurrenceDescriptor::isFMulAddIntrinsic(CurrentLinkI) &&
9628             "Expected instruction to be a call to the llvm.fmuladd intrinsic");
9629         assert(((MinVF.isScalar() && isa<VPReplicateRecipe>(CurrentLink)) ||
9630                 isa<VPWidenIntrinsicRecipe>(CurrentLink)) &&
9631                CurrentLink->getOperand(2) == PreviousLink &&
9632                "expected a call where the previous link is the added operand");
9633 
9634         // If the instruction is a call to the llvm.fmuladd intrinsic then we
9635         // need to create an fmul recipe (multiplying the first two operands of
9636         // the fmuladd together) to use as the vector operand for the fadd
9637         // reduction.
9638         VPInstruction *FMulRecipe = new VPInstruction(
9639             Instruction::FMul,
9640             {CurrentLink->getOperand(0), CurrentLink->getOperand(1)},
9641             CurrentLinkI->getFastMathFlags());
9642         LinkVPBB->insert(FMulRecipe, CurrentLink->getIterator());
9643         VecOp = FMulRecipe;
9644       } else {
9645         auto *Blend = dyn_cast<VPBlendRecipe>(CurrentLink);
9646         if (PhiR->isInLoop() && Blend) {
9647           assert(Blend->getNumIncomingValues() == 2 &&
9648                  "Blend must have 2 incoming values");
9649           if (Blend->getIncomingValue(0) == PhiR)
9650             Blend->replaceAllUsesWith(Blend->getIncomingValue(1));
9651           else {
9652             assert(Blend->getIncomingValue(1) == PhiR &&
9653                    "PhiR must be an operand of the blend");
9654             Blend->replaceAllUsesWith(Blend->getIncomingValue(0));
9655           }
9656           continue;
9657         }
9658 
9659         if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9660           if (isa<VPWidenRecipe>(CurrentLink)) {
9661             assert(isa<CmpInst>(CurrentLinkI) &&
9662                    "need to have the compare of the select");
9663             continue;
9664           }
9665           assert(isa<VPWidenSelectRecipe>(CurrentLink) &&
9666                  "must be a select recipe");
9667           IndexOfFirstOperand = 1;
9668         } else {
9669           assert((MinVF.isScalar() || isa<VPWidenRecipe>(CurrentLink)) &&
9670                  "Expected to replace a VPWidenSC");
9671           IndexOfFirstOperand = 0;
9672         }
9673         // Note that for non-commutable operands (cmp-selects), the semantics of
9674         // the cmp-select are captured in the recurrence kind.
9675         unsigned VecOpId =
9676             CurrentLink->getOperand(IndexOfFirstOperand) == PreviousLink
9677                 ? IndexOfFirstOperand + 1
9678                 : IndexOfFirstOperand;
9679         VecOp = CurrentLink->getOperand(VecOpId);
9680         assert(VecOp != PreviousLink &&
9681                CurrentLink->getOperand(CurrentLink->getNumOperands() - 1 -
9682                                        (VecOpId - IndexOfFirstOperand)) ==
9683                    PreviousLink &&
9684                "PreviousLink must be the operand other than VecOp");
9685       }
9686 
9687       BasicBlock *BB = CurrentLinkI->getParent();
9688       VPValue *CondOp = nullptr;
9689       if (CM.blockNeedsPredicationForAnyReason(BB))
9690         CondOp = RecipeBuilder.getBlockInMask(BB);
9691 
9692       auto *RedRecipe = new VPReductionRecipe(
9693           RdxDesc, CurrentLinkI, PreviousLink, VecOp, CondOp,
9694           CM.useOrderedReductions(RdxDesc), CurrentLinkI->getDebugLoc());
9695       // Append the recipe to the end of the VPBasicBlock because we need to
9696       // ensure that it comes after all of it's inputs, including CondOp.
9697       // Delete CurrentLink as it will be invalid if its operand is replaced
9698       // with a reduction defined at the bottom of the block in the next link.
9699       LinkVPBB->appendRecipe(RedRecipe);
9700       CurrentLink->replaceAllUsesWith(RedRecipe);
9701       ToDelete.push_back(CurrentLink);
9702       PreviousLink = RedRecipe;
9703     }
9704   }
9705   VPBasicBlock *LatchVPBB = VectorLoopRegion->getExitingBasicBlock();
9706   Builder.setInsertPoint(&*LatchVPBB->begin());
9707   VPBasicBlock::iterator IP = MiddleVPBB->getFirstNonPhi();
9708   for (VPRecipeBase &R :
9709        Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
9710     VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9711     if (!PhiR)
9712       continue;
9713 
9714     const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
9715     // If tail is folded by masking, introduce selects between the phi
9716     // and the users outside the vector region of each reduction, at the
9717     // beginning of the dedicated latch block.
9718     auto *OrigExitingVPV = PhiR->getBackedgeValue();
9719     auto *NewExitingVPV = PhiR->getBackedgeValue();
9720     if (!PhiR->isInLoop() && CM.foldTailByMasking()) {
9721       VPValue *Cond = RecipeBuilder.getBlockInMask(OrigLoop->getHeader());
9722       assert(OrigExitingVPV->getDefiningRecipe()->getParent() != LatchVPBB &&
9723              "reduction recipe must be defined before latch");
9724       Type *PhiTy = PhiR->getOperand(0)->getLiveInIRValue()->getType();
9725       std::optional<FastMathFlags> FMFs =
9726           PhiTy->isFloatingPointTy()
9727               ? std::make_optional(RdxDesc.getFastMathFlags())
9728               : std::nullopt;
9729       NewExitingVPV =
9730           Builder.createSelect(Cond, OrigExitingVPV, PhiR, {}, "", FMFs);
9731       OrigExitingVPV->replaceUsesWithIf(NewExitingVPV, [](VPUser &U, unsigned) {
9732         return isa<VPInstruction>(&U) &&
9733                cast<VPInstruction>(&U)->getOpcode() ==
9734                    VPInstruction::ComputeReductionResult;
9735       });
9736       if (CM.usePredicatedReductionSelect(
9737               PhiR->getRecurrenceDescriptor().getOpcode(), PhiTy))
9738         PhiR->setOperand(1, NewExitingVPV);
9739     }
9740 
9741     // If the vector reduction can be performed in a smaller type, we truncate
9742     // then extend the loop exit value to enable InstCombine to evaluate the
9743     // entire expression in the smaller type.
9744     Type *PhiTy = PhiR->getStartValue()->getLiveInIRValue()->getType();
9745     if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType() &&
9746         !RecurrenceDescriptor::isAnyOfRecurrenceKind(
9747             RdxDesc.getRecurrenceKind())) {
9748       assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
9749       Type *RdxTy = RdxDesc.getRecurrenceType();
9750       auto *Trunc =
9751           new VPWidenCastRecipe(Instruction::Trunc, NewExitingVPV, RdxTy);
9752       auto *Extnd =
9753           RdxDesc.isSigned()
9754               ? new VPWidenCastRecipe(Instruction::SExt, Trunc, PhiTy)
9755               : new VPWidenCastRecipe(Instruction::ZExt, Trunc, PhiTy);
9756 
9757       Trunc->insertAfter(NewExitingVPV->getDefiningRecipe());
9758       Extnd->insertAfter(Trunc);
9759       if (PhiR->getOperand(1) == NewExitingVPV)
9760         PhiR->setOperand(1, Extnd->getVPSingleValue());
9761       NewExitingVPV = Extnd;
9762     }
9763 
9764     // We want code in the middle block to appear to execute on the location of
9765     // the scalar loop's latch terminator because: (a) it is all compiler
9766     // generated, (b) these instructions are always executed after evaluating
9767     // the latch conditional branch, and (c) other passes may add new
9768     // predecessors which terminate on this line. This is the easiest way to
9769     // ensure we don't accidentally cause an extra step back into the loop while
9770     // debugging.
9771     DebugLoc ExitDL = OrigLoop->getLoopLatch()->getTerminator()->getDebugLoc();
9772 
9773     // TODO: At the moment ComputeReductionResult also drives creation of the
9774     // bc.merge.rdx phi nodes, hence it needs to be created unconditionally here
9775     // even for in-loop reductions, until the reduction resume value handling is
9776     // also modeled in VPlan.
9777     auto *FinalReductionResult = new VPInstruction(
9778         VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL);
9779     // Update all users outside the vector region.
9780     OrigExitingVPV->replaceUsesWithIf(
9781         FinalReductionResult, [](VPUser &User, unsigned) {
9782           auto *Parent = cast<VPRecipeBase>(&User)->getParent();
9783           return Parent && !Parent->getParent();
9784         });
9785     FinalReductionResult->insertBefore(*MiddleVPBB, IP);
9786 
9787     // Adjust AnyOf reductions; replace the reduction phi for the selected value
9788     // with a boolean reduction phi node to check if the condition is true in
9789     // any iteration. The final value is selected by the final
9790     // ComputeReductionResult.
9791     if (RecurrenceDescriptor::isAnyOfRecurrenceKind(
9792             RdxDesc.getRecurrenceKind())) {
9793       auto *Select = cast<VPRecipeBase>(*find_if(PhiR->users(), [](VPUser *U) {
9794         return isa<VPWidenSelectRecipe>(U) ||
9795                (isa<VPReplicateRecipe>(U) &&
9796                 cast<VPReplicateRecipe>(U)->getUnderlyingInstr()->getOpcode() ==
9797                     Instruction::Select);
9798       }));
9799       VPValue *Cmp = Select->getOperand(0);
9800       // If the compare is checking the reduction PHI node, adjust it to check
9801       // the start value.
9802       if (VPRecipeBase *CmpR = Cmp->getDefiningRecipe()) {
9803         for (unsigned I = 0; I != CmpR->getNumOperands(); ++I)
9804           if (CmpR->getOperand(I) == PhiR)
9805             CmpR->setOperand(I, PhiR->getStartValue());
9806       }
9807       VPBuilder::InsertPointGuard Guard(Builder);
9808       Builder.setInsertPoint(Select);
9809 
9810       // If the true value of the select is the reduction phi, the new value is
9811       // selected if the negated condition is true in any iteration.
9812       if (Select->getOperand(1) == PhiR)
9813         Cmp = Builder.createNot(Cmp);
9814       VPValue *Or = Builder.createOr(PhiR, Cmp);
9815       Select->getVPSingleValue()->replaceAllUsesWith(Or);
9816       // Delete Select now that it has invalid types.
9817       ToDelete.push_back(Select);
9818 
9819       // Convert the reduction phi to operate on bools.
9820       PhiR->setOperand(0, Plan->getOrAddLiveIn(ConstantInt::getFalse(
9821                               OrigLoop->getHeader()->getContext())));
9822       continue;
9823     }
9824 
9825     if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(
9826             RdxDesc.getRecurrenceKind())) {
9827       // Adjust the start value for FindLastIV recurrences to use the sentinel
9828       // value after generating the ResumePhi recipe, which uses the original
9829       // start value.
9830       PhiR->setOperand(0, Plan->getOrAddLiveIn(RdxDesc.getSentinelValue()));
9831     }
9832   }
9833   for (VPRecipeBase *R : ToDelete)
9834     R->eraseFromParent();
9835 
9836   VPlanTransforms::runPass(VPlanTransforms::clearReductionWrapFlags, *Plan);
9837 }
9838 
9839 void VPDerivedIVRecipe::execute(VPTransformState &State) {
9840   assert(!State.Lane && "VPDerivedIVRecipe being replicated.");
9841 
9842   // Fast-math-flags propagate from the original induction instruction.
9843   IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
9844   if (FPBinOp)
9845     State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags());
9846 
9847   Value *Step = State.get(getStepValue(), VPLane(0));
9848   Value *Index = State.get(getOperand(1), VPLane(0));
9849   Value *DerivedIV = emitTransformedIndex(
9850       State.Builder, Index, getStartValue()->getLiveInIRValue(), Step, Kind,
9851       cast_if_present<BinaryOperator>(FPBinOp));
9852   DerivedIV->setName(Name);
9853   // If index is the vector trip count, the concrete value will only be set in
9854   // prepareToExecute, leading to missed simplifications, e.g. if it is 0.
9855   // TODO: Remove the special case for the vector trip count once it is computed
9856   // in VPlan and can be used during VPlan simplification.
9857   assert((DerivedIV != Index ||
9858           getOperand(1) == &getParent()->getPlan()->getVectorTripCount()) &&
9859          "IV didn't need transforming?");
9860   State.set(this, DerivedIV, VPLane(0));
9861 }
9862 
9863 void VPReplicateRecipe::execute(VPTransformState &State) {
9864   Instruction *UI = getUnderlyingInstr();
9865   if (State.Lane) { // Generate a single instance.
9866     assert((State.VF.isScalar() || !isUniform()) &&
9867            "uniform recipe shouldn't be predicated");
9868     assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9869     State.ILV->scalarizeInstruction(UI, this, *State.Lane, State);
9870     // Insert scalar instance packing it into a vector.
9871     if (State.VF.isVector() && shouldPack()) {
9872       // If we're constructing lane 0, initialize to start from poison.
9873       if (State.Lane->isFirstLane()) {
9874         assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9875         Value *Poison = PoisonValue::get(
9876             VectorType::get(UI->getType(), State.VF));
9877         State.set(this, Poison);
9878       }
9879       State.packScalarIntoVectorValue(this, *State.Lane);
9880     }
9881     return;
9882   }
9883 
9884   if (IsUniform) {
9885     // Uniform within VL means we need to generate lane 0.
9886     State.ILV->scalarizeInstruction(UI, this, VPLane(0), State);
9887     return;
9888   }
9889 
9890   // A store of a loop varying value to a uniform address only needs the last
9891   // copy of the store.
9892   if (isa<StoreInst>(UI) &&
9893       vputils::isUniformAfterVectorization(getOperand(1))) {
9894     auto Lane = VPLane::getLastLaneForVF(State.VF);
9895     State.ILV->scalarizeInstruction(UI, this, VPLane(Lane), State);
9896     return;
9897   }
9898 
9899   // Generate scalar instances for all VF lanes.
9900   assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9901   const unsigned EndLane = State.VF.getKnownMinValue();
9902   for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9903     State.ILV->scalarizeInstruction(UI, this, VPLane(Lane), State);
9904 }
9905 
9906 // Determine how to lower the scalar epilogue, which depends on 1) optimising
9907 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
9908 // predication, and 4) a TTI hook that analyses whether the loop is suitable
9909 // for predication.
9910 static ScalarEpilogueLowering getScalarEpilogueLowering(
9911     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
9912     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
9913     LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI) {
9914   // 1) OptSize takes precedence over all other options, i.e. if this is set,
9915   // don't look at hints or options, and don't request a scalar epilogue.
9916   // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
9917   // LoopAccessInfo (due to code dependency and not being able to reliably get
9918   // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
9919   // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
9920   // versioning when the vectorization is forced, unlike hasOptSize. So revert
9921   // back to the old way and vectorize with versioning when forced. See D81345.)
9922   if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
9923                                                       PGSOQueryType::IRPass) &&
9924                           Hints.getForce() != LoopVectorizeHints::FK_Enabled))
9925     return CM_ScalarEpilogueNotAllowedOptSize;
9926 
9927   // 2) If set, obey the directives
9928   if (PreferPredicateOverEpilogue.getNumOccurrences()) {
9929     switch (PreferPredicateOverEpilogue) {
9930     case PreferPredicateTy::ScalarEpilogue:
9931       return CM_ScalarEpilogueAllowed;
9932     case PreferPredicateTy::PredicateElseScalarEpilogue:
9933       return CM_ScalarEpilogueNotNeededUsePredicate;
9934     case PreferPredicateTy::PredicateOrDontVectorize:
9935       return CM_ScalarEpilogueNotAllowedUsePredicate;
9936     };
9937   }
9938 
9939   // 3) If set, obey the hints
9940   switch (Hints.getPredicate()) {
9941   case LoopVectorizeHints::FK_Enabled:
9942     return CM_ScalarEpilogueNotNeededUsePredicate;
9943   case LoopVectorizeHints::FK_Disabled:
9944     return CM_ScalarEpilogueAllowed;
9945   };
9946 
9947   // 4) if the TTI hook indicates this is profitable, request predication.
9948   TailFoldingInfo TFI(TLI, &LVL, IAI);
9949   if (TTI->preferPredicateOverEpilogue(&TFI))
9950     return CM_ScalarEpilogueNotNeededUsePredicate;
9951 
9952   return CM_ScalarEpilogueAllowed;
9953 }
9954 
9955 // Process the loop in the VPlan-native vectorization path. This path builds
9956 // VPlan upfront in the vectorization pipeline, which allows to apply
9957 // VPlan-to-VPlan transformations from the very beginning without modifying the
9958 // input LLVM IR.
9959 static bool processLoopInVPlanNativePath(
9960     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
9961     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
9962     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
9963     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
9964     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
9965     LoopVectorizationRequirements &Requirements) {
9966 
9967   if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
9968     LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
9969     return false;
9970   }
9971   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
9972   Function *F = L->getHeader()->getParent();
9973   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
9974 
9975   ScalarEpilogueLowering SEL =
9976       getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, *LVL, &IAI);
9977 
9978   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
9979                                 &Hints, IAI);
9980   // Use the planner for outer loop vectorization.
9981   // TODO: CM is not used at this point inside the planner. Turn CM into an
9982   // optional argument if we don't need it in the future.
9983   LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, IAI, PSE, Hints,
9984                                ORE);
9985 
9986   // Get user vectorization factor.
9987   ElementCount UserVF = Hints.getWidth();
9988 
9989   CM.collectElementTypesForWidening();
9990 
9991   // Plan how to best vectorize, return the best VF and its cost.
9992   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
9993 
9994   // If we are stress testing VPlan builds, do not attempt to generate vector
9995   // code. Masked vector code generation support will follow soon.
9996   // Also, do not attempt to vectorize if no vector code will be produced.
9997   if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF)
9998     return false;
9999 
10000   VPlan &BestPlan = LVP.getPlanFor(VF.Width);
10001 
10002   {
10003     bool AddBranchWeights =
10004         hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
10005     GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(),
10006                              AddBranchWeights, CM.CostKind);
10007     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
10008                            VF.Width, 1, LVL, &CM, BFI, PSI, Checks, BestPlan);
10009     LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
10010                       << L->getHeader()->getParent()->getName() << "\"\n");
10011     LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false);
10012   }
10013 
10014   reportVectorization(ORE, L, VF, 1);
10015 
10016   // Mark the loop as already vectorized to avoid vectorizing again.
10017   Hints.setAlreadyVectorized();
10018   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10019   return true;
10020 }
10021 
10022 // Emit a remark if there are stores to floats that required a floating point
10023 // extension. If the vectorized loop was generated with floating point there
10024 // will be a performance penalty from the conversion overhead and the change in
10025 // the vector width.
10026 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
10027   SmallVector<Instruction *, 4> Worklist;
10028   for (BasicBlock *BB : L->getBlocks()) {
10029     for (Instruction &Inst : *BB) {
10030       if (auto *S = dyn_cast<StoreInst>(&Inst)) {
10031         if (S->getValueOperand()->getType()->isFloatTy())
10032           Worklist.push_back(S);
10033       }
10034     }
10035   }
10036 
10037   // Traverse the floating point stores upwards searching, for floating point
10038   // conversions.
10039   SmallPtrSet<const Instruction *, 4> Visited;
10040   SmallPtrSet<const Instruction *, 4> EmittedRemark;
10041   while (!Worklist.empty()) {
10042     auto *I = Worklist.pop_back_val();
10043     if (!L->contains(I))
10044       continue;
10045     if (!Visited.insert(I).second)
10046       continue;
10047 
10048     // Emit a remark if the floating point store required a floating
10049     // point conversion.
10050     // TODO: More work could be done to identify the root cause such as a
10051     // constant or a function return type and point the user to it.
10052     if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
10053       ORE->emit([&]() {
10054         return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
10055                                           I->getDebugLoc(), L->getHeader())
10056                << "floating point conversion changes vector width. "
10057                << "Mixed floating point precision requires an up/down "
10058                << "cast that will negatively impact performance.";
10059       });
10060 
10061     for (Use &Op : I->operands())
10062       if (auto *OpI = dyn_cast<Instruction>(Op))
10063         Worklist.push_back(OpI);
10064   }
10065 }
10066 
10067 static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
10068                                        VectorizationFactor &VF, Loop *L,
10069                                        const TargetTransformInfo &TTI,
10070                                        PredicatedScalarEvolution &PSE,
10071                                        ScalarEpilogueLowering SEL) {
10072   InstructionCost CheckCost = Checks.getCost();
10073   if (!CheckCost.isValid())
10074     return false;
10075 
10076   // When interleaving only scalar and vector cost will be equal, which in turn
10077   // would lead to a divide by 0. Fall back to hard threshold.
10078   if (VF.Width.isScalar()) {
10079     if (CheckCost > VectorizeMemoryCheckThreshold) {
10080       LLVM_DEBUG(
10081           dbgs()
10082           << "LV: Interleaving only is not profitable due to runtime checks\n");
10083       return false;
10084     }
10085     return true;
10086   }
10087 
10088   // The scalar cost should only be 0 when vectorizing with a user specified VF/IC. In those cases, runtime checks should always be generated.
10089   uint64_t ScalarC = *VF.ScalarCost.getValue();
10090   if (ScalarC == 0)
10091     return true;
10092 
10093   // First, compute the minimum iteration count required so that the vector
10094   // loop outperforms the scalar loop.
10095   //  The total cost of the scalar loop is
10096   //   ScalarC * TC
10097   //  where
10098   //  * TC is the actual trip count of the loop.
10099   //  * ScalarC is the cost of a single scalar iteration.
10100   //
10101   //  The total cost of the vector loop is
10102   //    RtC + VecC * (TC / VF) + EpiC
10103   //  where
10104   //  * RtC is the cost of the generated runtime checks
10105   //  * VecC is the cost of a single vector iteration.
10106   //  * TC is the actual trip count of the loop
10107   //  * VF is the vectorization factor
10108   //  * EpiCost is the cost of the generated epilogue, including the cost
10109   //    of the remaining scalar operations.
10110   //
10111   // Vectorization is profitable once the total vector cost is less than the
10112   // total scalar cost:
10113   //   RtC + VecC * (TC / VF) + EpiC <  ScalarC * TC
10114   //
10115   // Now we can compute the minimum required trip count TC as
10116   //   VF * (RtC + EpiC) / (ScalarC * VF - VecC) < TC
10117   //
10118   // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
10119   // the computations are performed on doubles, not integers and the result
10120   // is rounded up, hence we get an upper estimate of the TC.
10121   unsigned IntVF = getEstimatedRuntimeVF(L, TTI, VF.Width);
10122   uint64_t RtC = *CheckCost.getValue();
10123   uint64_t Div = ScalarC * IntVF - *VF.Cost.getValue();
10124   uint64_t MinTC1 = Div == 0 ? 0 : divideCeil(RtC * IntVF, Div);
10125 
10126   // Second, compute a minimum iteration count so that the cost of the
10127   // runtime checks is only a fraction of the total scalar loop cost. This
10128   // adds a loop-dependent bound on the overhead incurred if the runtime
10129   // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC
10130   // * TC. To bound the runtime check to be a fraction 1/X of the scalar
10131   // cost, compute
10132   //   RtC < ScalarC * TC * (1 / X)  ==>  RtC * X / ScalarC < TC
10133   uint64_t MinTC2 = divideCeil(RtC * 10, ScalarC);
10134 
10135   // Now pick the larger minimum. If it is not a multiple of VF and a scalar
10136   // epilogue is allowed, choose the next closest multiple of VF. This should
10137   // partly compensate for ignoring the epilogue cost.
10138   uint64_t MinTC = std::max(MinTC1, MinTC2);
10139   if (SEL == CM_ScalarEpilogueAllowed)
10140     MinTC = alignTo(MinTC, IntVF);
10141   VF.MinProfitableTripCount = ElementCount::getFixed(MinTC);
10142 
10143   LLVM_DEBUG(
10144       dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
10145              << VF.MinProfitableTripCount << "\n");
10146 
10147   // Skip vectorization if the expected trip count is less than the minimum
10148   // required trip count.
10149   if (auto ExpectedTC = getSmallBestKnownTC(PSE, L)) {
10150     if (ElementCount::isKnownLT(ElementCount::getFixed(*ExpectedTC),
10151                                 VF.MinProfitableTripCount)) {
10152       LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
10153                            "trip count < minimum profitable VF ("
10154                         << *ExpectedTC << " < " << VF.MinProfitableTripCount
10155                         << ")\n");
10156 
10157       return false;
10158     }
10159   }
10160   return true;
10161 }
10162 
10163 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
10164     : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
10165                                !EnableLoopInterleaving),
10166       VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
10167                               !EnableLoopVectorization) {}
10168 
10169 /// Prepare \p MainPlan for vectorizing the main vector loop during epilogue
10170 /// vectorization. Remove ResumePhis from \p MainPlan for inductions that
10171 /// don't have a corresponding wide induction in \p EpiPlan.
10172 static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) {
10173   // Collect PHI nodes of widened phis in the VPlan for the epilogue. Those
10174   // will need their resume-values computed in the main vector loop. Others
10175   // can be removed from the main VPlan.
10176   SmallPtrSet<PHINode *, 2> EpiWidenedPhis;
10177   for (VPRecipeBase &R :
10178        EpiPlan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
10179     if (isa<VPCanonicalIVPHIRecipe>(&R))
10180       continue;
10181     EpiWidenedPhis.insert(
10182         cast<PHINode>(R.getVPSingleValue()->getUnderlyingValue()));
10183   }
10184   for (VPRecipeBase &R : make_early_inc_range(
10185            *cast<VPIRBasicBlock>(MainPlan.getScalarHeader()))) {
10186     auto *VPIRInst = cast<VPIRInstruction>(&R);
10187     auto *IRI = dyn_cast<PHINode>(&VPIRInst->getInstruction());
10188     if (!IRI)
10189       break;
10190     if (EpiWidenedPhis.contains(IRI))
10191       continue;
10192     // There is no corresponding wide induction in the epilogue plan that would
10193     // need a resume value. Remove the VPIRInst wrapping the scalar header phi
10194     // together with the corresponding ResumePhi. The resume values for the
10195     // scalar loop will be created during execution of EpiPlan.
10196     VPRecipeBase *ResumePhi = VPIRInst->getOperand(0)->getDefiningRecipe();
10197     VPIRInst->eraseFromParent();
10198     ResumePhi->eraseFromParent();
10199   }
10200   VPlanTransforms::runPass(VPlanTransforms::removeDeadRecipes, MainPlan);
10201 
10202   using namespace VPlanPatternMatch;
10203   VPBasicBlock *MainScalarPH = MainPlan.getScalarPreheader();
10204   VPValue *VectorTC = &MainPlan.getVectorTripCount();
10205   // If there is a suitable resume value for the canonical induction in the
10206   // scalar (which will become vector) epilogue loop we are done. Otherwise
10207   // create it below.
10208   if (any_of(*MainScalarPH, [VectorTC](VPRecipeBase &R) {
10209         return match(&R, m_VPInstruction<VPInstruction::ResumePhi>(
10210                              m_Specific(VectorTC), m_SpecificInt(0)));
10211       }))
10212     return;
10213   VPBuilder ScalarPHBuilder(MainScalarPH, MainScalarPH->begin());
10214   ScalarPHBuilder.createNaryOp(
10215       VPInstruction::ResumePhi,
10216       {VectorTC, MainPlan.getCanonicalIV()->getStartValue()}, {},
10217       "vec.epilog.resume.val");
10218 }
10219 
10220 /// Prepare \p Plan for vectorizing the epilogue loop. That is, re-use expanded
10221 /// SCEVs from \p ExpandedSCEVs and set resume values for header recipes.
10222 static void
10223 preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L,
10224                                  const SCEV2ValueTy &ExpandedSCEVs,
10225                                  const EpilogueLoopVectorizationInfo &EPI) {
10226   VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
10227   VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
10228   Header->setName("vec.epilog.vector.body");
10229 
10230   // Re-use the trip count and steps expanded for the main loop, as
10231   // skeleton creation needs it as a value that dominates both the scalar
10232   // and vector epilogue loops
10233   // TODO: This is a workaround needed for epilogue vectorization and it
10234   // should be removed once induction resume value creation is done
10235   // directly in VPlan.
10236   for (auto &R : make_early_inc_range(*Plan.getEntry())) {
10237     auto *ExpandR = dyn_cast<VPExpandSCEVRecipe>(&R);
10238     if (!ExpandR)
10239       continue;
10240     auto *ExpandedVal =
10241         Plan.getOrAddLiveIn(ExpandedSCEVs.find(ExpandR->getSCEV())->second);
10242     ExpandR->replaceAllUsesWith(ExpandedVal);
10243     if (Plan.getTripCount() == ExpandR)
10244       Plan.resetTripCount(ExpandedVal);
10245     ExpandR->eraseFromParent();
10246   }
10247 
10248   // Ensure that the start values for all header phi recipes are updated before
10249   // vectorizing the epilogue loop.
10250   for (VPRecipeBase &R : Header->phis()) {
10251     if (auto *IV = dyn_cast<VPCanonicalIVPHIRecipe>(&R)) {
10252       // When vectorizing the epilogue loop, the canonical induction start
10253       // value needs to be changed from zero to the value after the main
10254       // vector loop. Find the resume value created during execution of the main
10255       // VPlan.
10256       // FIXME: Improve modeling for canonical IV start values in the epilogue
10257       // loop.
10258       BasicBlock *MainMiddle = find_singleton<BasicBlock>(
10259           predecessors(L->getLoopPreheader()),
10260           [&EPI](BasicBlock *BB, bool) -> BasicBlock * {
10261             if (BB != EPI.MainLoopIterationCountCheck &&
10262                 BB != EPI.EpilogueIterationCountCheck &&
10263                 BB != EPI.SCEVSafetyCheck && BB != EPI.MemSafetyCheck)
10264               return BB;
10265             return nullptr;
10266           });
10267       using namespace llvm::PatternMatch;
10268       Type *IdxTy = IV->getScalarType();
10269       PHINode *EPResumeVal = find_singleton<PHINode>(
10270           L->getLoopPreheader()->phis(),
10271           [&EPI, IdxTy, MainMiddle](PHINode &P, bool) -> PHINode * {
10272             if (P.getType() == IdxTy &&
10273                 P.getIncomingValueForBlock(MainMiddle) == EPI.VectorTripCount &&
10274                 match(
10275                     P.getIncomingValueForBlock(EPI.MainLoopIterationCountCheck),
10276                     m_SpecificInt(0)))
10277               return &P;
10278             return nullptr;
10279           });
10280       assert(EPResumeVal && "must have a resume value for the canonical IV");
10281       VPValue *VPV = Plan.getOrAddLiveIn(EPResumeVal);
10282       assert(all_of(IV->users(),
10283                     [](const VPUser *U) {
10284                       return isa<VPScalarIVStepsRecipe>(U) ||
10285                              isa<VPScalarCastRecipe>(U) ||
10286                              isa<VPDerivedIVRecipe>(U) ||
10287                              cast<VPInstruction>(U)->getOpcode() ==
10288                                  Instruction::Add;
10289                     }) &&
10290              "the canonical IV should only be used by its increment or "
10291              "ScalarIVSteps when resetting the start value");
10292       IV->setOperand(0, VPV);
10293       continue;
10294     }
10295 
10296     Value *ResumeV = nullptr;
10297     // TODO: Move setting of resume values to prepareToExecute.
10298     if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
10299       ResumeV = cast<PHINode>(ReductionPhi->getUnderlyingInstr())
10300                     ->getIncomingValueForBlock(L->getLoopPreheader());
10301       const RecurrenceDescriptor &RdxDesc =
10302           ReductionPhi->getRecurrenceDescriptor();
10303       RecurKind RK = RdxDesc.getRecurrenceKind();
10304       if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
10305         // VPReductionPHIRecipes for AnyOf reductions expect a boolean as
10306         // start value; compare the final value from the main vector loop
10307         // to the start value.
10308         BasicBlock *PBB = cast<Instruction>(ResumeV)->getParent();
10309         IRBuilder<> Builder(PBB, PBB->getFirstNonPHIIt());
10310         ResumeV =
10311             Builder.CreateICmpNE(ResumeV, RdxDesc.getRecurrenceStartValue());
10312       } else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) {
10313         // VPReductionPHIRecipe for FindLastIV reductions requires an adjustment
10314         // to the resume value. The resume value is adjusted to the sentinel
10315         // value when the final value from the main vector loop equals the start
10316         // value. This ensures correctness when the start value might not be
10317         // less than the minimum value of a monotonically increasing induction
10318         // variable.
10319         BasicBlock *ResumeBB = cast<Instruction>(ResumeV)->getParent();
10320         IRBuilder<> Builder(ResumeBB, ResumeBB->getFirstNonPHIIt());
10321         Value *Cmp =
10322             Builder.CreateICmpEQ(ResumeV, RdxDesc.getRecurrenceStartValue());
10323         ResumeV =
10324             Builder.CreateSelect(Cmp, RdxDesc.getSentinelValue(), ResumeV);
10325       }
10326     } else {
10327       // Retrieve the induction resume values for wide inductions from
10328       // their original phi nodes in the scalar loop.
10329       PHINode *IndPhi = cast<VPWidenInductionRecipe>(&R)->getPHINode();
10330       // Hook up to the PHINode generated by a ResumePhi recipe of main
10331       // loop VPlan, which feeds the scalar loop.
10332       ResumeV = IndPhi->getIncomingValueForBlock(L->getLoopPreheader());
10333     }
10334     assert(ResumeV && "Must have a resume value");
10335     VPValue *StartVal = Plan.getOrAddLiveIn(ResumeV);
10336     cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
10337   }
10338 }
10339 
10340 bool LoopVectorizePass::processLoop(Loop *L) {
10341   assert((EnableVPlanNativePath || L->isInnermost()) &&
10342          "VPlan-native path is not enabled. Only process inner loops.");
10343 
10344   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
10345                     << L->getHeader()->getParent()->getName() << "' from "
10346                     << L->getLocStr() << "\n");
10347 
10348   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
10349 
10350   LLVM_DEBUG(
10351       dbgs() << "LV: Loop hints:"
10352              << " force="
10353              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
10354                      ? "disabled"
10355                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
10356                             ? "enabled"
10357                             : "?"))
10358              << " width=" << Hints.getWidth()
10359              << " interleave=" << Hints.getInterleave() << "\n");
10360 
10361   // Function containing loop
10362   Function *F = L->getHeader()->getParent();
10363 
10364   // Looking at the diagnostic output is the only way to determine if a loop
10365   // was vectorized (other than looking at the IR or machine code), so it
10366   // is important to generate an optimization remark for each loop. Most of
10367   // these messages are generated as OptimizationRemarkAnalysis. Remarks
10368   // generated as OptimizationRemark and OptimizationRemarkMissed are
10369   // less verbose reporting vectorized loops and unvectorized loops that may
10370   // benefit from vectorization, respectively.
10371 
10372   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
10373     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
10374     return false;
10375   }
10376 
10377   PredicatedScalarEvolution PSE(*SE, *L);
10378 
10379   // Check if it is legal to vectorize the loop.
10380   LoopVectorizationRequirements Requirements;
10381   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE,
10382                                 &Requirements, &Hints, DB, AC, BFI, PSI);
10383   if (!LVL.canVectorize(EnableVPlanNativePath)) {
10384     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
10385     Hints.emitRemarkWithHints();
10386     return false;
10387   }
10388 
10389   if (LVL.hasUncountableEarlyExit() && !EnableEarlyExitVectorization) {
10390     reportVectorizationFailure("Auto-vectorization of loops with uncountable "
10391                                "early exit is not enabled",
10392                                "UncountableEarlyExitLoopsDisabled", ORE, L);
10393     return false;
10394   }
10395 
10396   if (LVL.hasStructVectorCall()) {
10397     reportVectorizationFailure("Auto-vectorization of calls that return struct "
10398                                "types is not yet supported",
10399                                "StructCallVectorizationUnsupported", ORE, L);
10400     return false;
10401   }
10402 
10403   // Entrance to the VPlan-native vectorization path. Outer loops are processed
10404   // here. They may require CFG and instruction level transformations before
10405   // even evaluating whether vectorization is profitable. Since we cannot modify
10406   // the incoming IR, we need to build VPlan upfront in the vectorization
10407   // pipeline.
10408   if (!L->isInnermost())
10409     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
10410                                         ORE, BFI, PSI, Hints, Requirements);
10411 
10412   assert(L->isInnermost() && "Inner loop expected.");
10413 
10414   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
10415   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
10416 
10417   // If an override option has been passed in for interleaved accesses, use it.
10418   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
10419     UseInterleaved = EnableInterleavedMemAccesses;
10420 
10421   // Analyze interleaved memory accesses.
10422   if (UseInterleaved)
10423     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
10424 
10425   if (LVL.hasUncountableEarlyExit()) {
10426     BasicBlock *LoopLatch = L->getLoopLatch();
10427     if (IAI.requiresScalarEpilogue() ||
10428         any_of(LVL.getCountableExitingBlocks(),
10429                [LoopLatch](BasicBlock *BB) { return BB != LoopLatch; })) {
10430       reportVectorizationFailure("Auto-vectorization of early exit loops "
10431                                  "requiring a scalar epilogue is unsupported",
10432                                  "UncountableEarlyExitUnsupported", ORE, L);
10433       return false;
10434     }
10435   }
10436 
10437   // Check the function attributes and profiles to find out if this function
10438   // should be optimized for size.
10439   ScalarEpilogueLowering SEL =
10440       getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL, &IAI);
10441 
10442   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
10443   // count by optimizing for size, to minimize overheads.
10444   auto ExpectedTC = getSmallBestKnownTC(PSE, L);
10445   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
10446     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
10447                       << "This loop is worth vectorizing only if no scalar "
10448                       << "iteration overheads are incurred.");
10449     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
10450       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
10451     else {
10452       if (*ExpectedTC > TTI->getMinTripCountTailFoldingThreshold()) {
10453         LLVM_DEBUG(dbgs() << "\n");
10454         // Predicate tail-folded loops are efficient even when the loop
10455         // iteration count is low. However, setting the epilogue policy to
10456         // `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops
10457         // with runtime checks. It's more effective to let
10458         // `areRuntimeChecksProfitable` determine if vectorization is beneficial
10459         // for the loop.
10460         if (SEL != CM_ScalarEpilogueNotNeededUsePredicate)
10461           SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
10462       } else {
10463         LLVM_DEBUG(dbgs() << " But the target considers the trip count too "
10464                              "small to consider vectorizing.\n");
10465         reportVectorizationFailure(
10466             "The trip count is below the minial threshold value.",
10467             "loop trip count is too low, avoiding vectorization",
10468             "LowTripCount", ORE, L);
10469         Hints.emitRemarkWithHints();
10470         return false;
10471       }
10472     }
10473   }
10474 
10475   // Check the function attributes to see if implicit floats or vectors are
10476   // allowed.
10477   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
10478     reportVectorizationFailure(
10479         "Can't vectorize when the NoImplicitFloat attribute is used",
10480         "loop not vectorized due to NoImplicitFloat attribute",
10481         "NoImplicitFloat", ORE, L);
10482     Hints.emitRemarkWithHints();
10483     return false;
10484   }
10485 
10486   // Check if the target supports potentially unsafe FP vectorization.
10487   // FIXME: Add a check for the type of safety issue (denormal, signaling)
10488   // for the target we're vectorizing for, to make sure none of the
10489   // additional fp-math flags can help.
10490   if (Hints.isPotentiallyUnsafe() &&
10491       TTI->isFPVectorizationPotentiallyUnsafe()) {
10492     reportVectorizationFailure(
10493         "Potentially unsafe FP op prevents vectorization",
10494         "loop not vectorized due to unsafe FP support.",
10495         "UnsafeFP", ORE, L);
10496     Hints.emitRemarkWithHints();
10497     return false;
10498   }
10499 
10500   bool AllowOrderedReductions;
10501   // If the flag is set, use that instead and override the TTI behaviour.
10502   if (ForceOrderedReductions.getNumOccurrences() > 0)
10503     AllowOrderedReductions = ForceOrderedReductions;
10504   else
10505     AllowOrderedReductions = TTI->enableOrderedReductions();
10506   if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) {
10507     ORE->emit([&]() {
10508       auto *ExactFPMathInst = Requirements.getExactFPInst();
10509       return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
10510                                                  ExactFPMathInst->getDebugLoc(),
10511                                                  ExactFPMathInst->getParent())
10512              << "loop not vectorized: cannot prove it is safe to reorder "
10513                 "floating-point operations";
10514     });
10515     LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
10516                          "reorder floating-point operations\n");
10517     Hints.emitRemarkWithHints();
10518     return false;
10519   }
10520 
10521   // Use the cost model.
10522   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
10523                                 F, &Hints, IAI);
10524   // Use the planner for vectorization.
10525   LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
10526                                ORE);
10527 
10528   // Get user vectorization factor and interleave count.
10529   ElementCount UserVF = Hints.getWidth();
10530   unsigned UserIC = Hints.getInterleave();
10531 
10532   // Plan how to best vectorize.
10533   LVP.plan(UserVF, UserIC);
10534   VectorizationFactor VF = LVP.computeBestVF();
10535   unsigned IC = 1;
10536 
10537   if (ORE->allowExtraAnalysis(LV_NAME))
10538     LVP.emitInvalidCostRemarks(ORE);
10539 
10540   bool AddBranchWeights =
10541       hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
10542   GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(),
10543                            AddBranchWeights, CM.CostKind);
10544   if (LVP.hasPlanWithVF(VF.Width)) {
10545     // Select the interleave count.
10546     IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
10547 
10548     unsigned SelectedIC = std::max(IC, UserIC);
10549     //  Optimistically generate runtime checks if they are needed. Drop them if
10550     //  they turn out to not be profitable.
10551     if (VF.Width.isVector() || SelectedIC > 1)
10552       Checks.create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
10553 
10554     // Check if it is profitable to vectorize with runtime checks.
10555     bool ForceVectorization =
10556         Hints.getForce() == LoopVectorizeHints::FK_Enabled;
10557     if (!ForceVectorization &&
10558         !areRuntimeChecksProfitable(Checks, VF, L, *TTI, PSE, SEL)) {
10559       ORE->emit([&]() {
10560         return OptimizationRemarkAnalysisAliasing(
10561                    DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
10562                    L->getHeader())
10563                << "loop not vectorized: cannot prove it is safe to reorder "
10564                   "memory operations";
10565       });
10566       LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
10567       Hints.emitRemarkWithHints();
10568       return false;
10569     }
10570   }
10571 
10572   // Identify the diagnostic messages that should be produced.
10573   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
10574   bool VectorizeLoop = true, InterleaveLoop = true;
10575   if (VF.Width.isScalar()) {
10576     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
10577     VecDiagMsg = std::make_pair(
10578         "VectorizationNotBeneficial",
10579         "the cost-model indicates that vectorization is not beneficial");
10580     VectorizeLoop = false;
10581   }
10582 
10583   if (!LVP.hasPlanWithVF(VF.Width) && UserIC > 1) {
10584     // Tell the user interleaving was avoided up-front, despite being explicitly
10585     // requested.
10586     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
10587                          "interleaving should be avoided up front\n");
10588     IntDiagMsg = std::make_pair(
10589         "InterleavingAvoided",
10590         "Ignoring UserIC, because interleaving was avoided up front");
10591     InterleaveLoop = false;
10592   } else if (IC == 1 && UserIC <= 1) {
10593     // Tell the user interleaving is not beneficial.
10594     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
10595     IntDiagMsg = std::make_pair(
10596         "InterleavingNotBeneficial",
10597         "the cost-model indicates that interleaving is not beneficial");
10598     InterleaveLoop = false;
10599     if (UserIC == 1) {
10600       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
10601       IntDiagMsg.second +=
10602           " and is explicitly disabled or interleave count is set to 1";
10603     }
10604   } else if (IC > 1 && UserIC == 1) {
10605     // Tell the user interleaving is beneficial, but it explicitly disabled.
10606     LLVM_DEBUG(
10607         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
10608     IntDiagMsg = std::make_pair(
10609         "InterleavingBeneficialButDisabled",
10610         "the cost-model indicates that interleaving is beneficial "
10611         "but is explicitly disabled or interleave count is set to 1");
10612     InterleaveLoop = false;
10613   }
10614 
10615   // If there is a histogram in the loop, do not just interleave without
10616   // vectorizing. The order of operations will be incorrect without the
10617   // histogram intrinsics, which are only used for recipes with VF > 1.
10618   if (!VectorizeLoop && InterleaveLoop && LVL.hasHistograms()) {
10619     LLVM_DEBUG(dbgs() << "LV: Not interleaving without vectorization due "
10620                       << "to histogram operations.\n");
10621     IntDiagMsg = std::make_pair(
10622         "HistogramPreventsScalarInterleaving",
10623         "Unable to interleave without vectorization due to constraints on "
10624         "the order of histogram operations");
10625     InterleaveLoop = false;
10626   }
10627 
10628   // Override IC if user provided an interleave count.
10629   IC = UserIC > 0 ? UserIC : IC;
10630 
10631   // Emit diagnostic messages, if any.
10632   const char *VAPassName = Hints.vectorizeAnalysisPassName();
10633   if (!VectorizeLoop && !InterleaveLoop) {
10634     // Do not vectorize or interleaving the loop.
10635     ORE->emit([&]() {
10636       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
10637                                       L->getStartLoc(), L->getHeader())
10638              << VecDiagMsg.second;
10639     });
10640     ORE->emit([&]() {
10641       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
10642                                       L->getStartLoc(), L->getHeader())
10643              << IntDiagMsg.second;
10644     });
10645     return false;
10646   }
10647 
10648   if (!VectorizeLoop && InterleaveLoop) {
10649     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10650     ORE->emit([&]() {
10651       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
10652                                         L->getStartLoc(), L->getHeader())
10653              << VecDiagMsg.second;
10654     });
10655   } else if (VectorizeLoop && !InterleaveLoop) {
10656     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10657                       << ") in " << L->getLocStr() << '\n');
10658     ORE->emit([&]() {
10659       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
10660                                         L->getStartLoc(), L->getHeader())
10661              << IntDiagMsg.second;
10662     });
10663   } else if (VectorizeLoop && InterleaveLoop) {
10664     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10665                       << ") in " << L->getLocStr() << '\n');
10666     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10667   }
10668 
10669   bool DisableRuntimeUnroll = false;
10670   MDNode *OrigLoopID = L->getLoopID();
10671   {
10672     using namespace ore;
10673     if (!VectorizeLoop) {
10674       assert(IC > 1 && "interleave count should not be 1 or 0");
10675       // If we decided that it is not legal to vectorize the loop, then
10676       // interleave it.
10677       VPlan &BestPlan = LVP.getPlanFor(VF.Width);
10678       InnerLoopVectorizer Unroller(
10679           L, PSE, LI, DT, TLI, TTI, AC, ORE, ElementCount::getFixed(1),
10680           ElementCount::getFixed(1), IC, &LVL, &CM, BFI, PSI, Checks, BestPlan);
10681 
10682       LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);
10683 
10684       ORE->emit([&]() {
10685         return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
10686                                   L->getHeader())
10687                << "interleaved loop (interleaved count: "
10688                << NV("InterleaveCount", IC) << ")";
10689       });
10690     } else {
10691       // If we decided that it is *legal* to vectorize the loop, then do it.
10692 
10693       VPlan &BestPlan = LVP.getPlanFor(VF.Width);
10694       // Consider vectorizing the epilogue too if it's profitable.
10695       VectorizationFactor EpilogueVF =
10696           LVP.selectEpilogueVectorizationFactor(VF.Width, IC);
10697       if (EpilogueVF.Width.isVector()) {
10698         std::unique_ptr<VPlan> BestMainPlan(BestPlan.duplicate());
10699 
10700         // The first pass vectorizes the main loop and creates a scalar epilogue
10701         // to be vectorized by executing the plan (potentially with a different
10702         // factor) again shortly afterwards.
10703         VPlan &BestEpiPlan = LVP.getPlanFor(EpilogueVF.Width);
10704         preparePlanForMainVectorLoop(*BestMainPlan, BestEpiPlan);
10705         EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1,
10706                                           BestEpiPlan);
10707         EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
10708                                            EPI, &LVL, &CM, BFI, PSI, Checks,
10709                                            *BestMainPlan);
10710         auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF,
10711                                              *BestMainPlan, MainILV, DT, false);
10712         ++LoopsVectorized;
10713 
10714         // Second pass vectorizes the epilogue and adjusts the control flow
10715         // edges from the first pass.
10716         EPI.MainLoopVF = EPI.EpilogueVF;
10717         EPI.MainLoopUF = EPI.EpilogueUF;
10718         EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
10719                                                  ORE, EPI, &LVL, &CM, BFI, PSI,
10720                                                  Checks, BestEpiPlan);
10721         EpilogILV.setTripCount(MainILV.getTripCount());
10722         preparePlanForEpilogueVectorLoop(BestEpiPlan, L, ExpandedSCEVs, EPI);
10723 
10724         LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
10725                         DT, true, &ExpandedSCEVs);
10726         ++LoopsEpilogueVectorized;
10727 
10728         if (!MainILV.areSafetyChecksAdded())
10729           DisableRuntimeUnroll = true;
10730       } else {
10731         InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
10732                                VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
10733                                PSI, Checks, BestPlan);
10734         LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
10735         ++LoopsVectorized;
10736 
10737         // Add metadata to disable runtime unrolling a scalar loop when there
10738         // are no runtime checks about strides and memory. A scalar loop that is
10739         // rarely used is not worth unrolling.
10740         if (!LB.areSafetyChecksAdded())
10741           DisableRuntimeUnroll = true;
10742       }
10743       // Report the vectorization decision.
10744       reportVectorization(ORE, L, VF, IC);
10745     }
10746 
10747     if (ORE->allowExtraAnalysis(LV_NAME))
10748       checkMixedPrecision(L, ORE);
10749   }
10750 
10751   assert(DT->verify(DominatorTree::VerificationLevel::Fast) &&
10752          "DT not preserved correctly");
10753 
10754   std::optional<MDNode *> RemainderLoopID =
10755       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
10756                                       LLVMLoopVectorizeFollowupEpilogue});
10757   if (RemainderLoopID) {
10758     L->setLoopID(*RemainderLoopID);
10759   } else {
10760     if (DisableRuntimeUnroll)
10761       addRuntimeUnrollDisableMetaData(L);
10762 
10763     // Mark the loop as already vectorized to avoid vectorizing again.
10764     Hints.setAlreadyVectorized();
10765   }
10766 
10767   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10768   return true;
10769 }
10770 
10771 LoopVectorizeResult LoopVectorizePass::runImpl(Function &F) {
10772 
10773   // Don't attempt if
10774   // 1. the target claims to have no vector registers, and
10775   // 2. interleaving won't help ILP.
10776   //
10777   // The second condition is necessary because, even if the target has no
10778   // vector registers, loop vectorization may still enable scalar
10779   // interleaving.
10780   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
10781       TTI->getMaxInterleaveFactor(ElementCount::getFixed(1)) < 2)
10782     return LoopVectorizeResult(false, false);
10783 
10784   bool Changed = false, CFGChanged = false;
10785 
10786   // The vectorizer requires loops to be in simplified form.
10787   // Since simplification may add new inner loops, it has to run before the
10788   // legality and profitability checks. This means running the loop vectorizer
10789   // will simplify all loops, regardless of whether anything end up being
10790   // vectorized.
10791   for (const auto &L : *LI)
10792     Changed |= CFGChanged |=
10793         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10794 
10795   // Build up a worklist of inner-loops to vectorize. This is necessary as
10796   // the act of vectorizing or partially unrolling a loop creates new loops
10797   // and can invalidate iterators across the loops.
10798   SmallVector<Loop *, 8> Worklist;
10799 
10800   for (Loop *L : *LI)
10801     collectSupportedLoops(*L, LI, ORE, Worklist);
10802 
10803   LoopsAnalyzed += Worklist.size();
10804 
10805   // Now walk the identified inner loops.
10806   while (!Worklist.empty()) {
10807     Loop *L = Worklist.pop_back_val();
10808 
10809     // For the inner loops we actually process, form LCSSA to simplify the
10810     // transform.
10811     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
10812 
10813     Changed |= CFGChanged |= processLoop(L);
10814 
10815     if (Changed) {
10816       LAIs->clear();
10817 
10818 #ifndef NDEBUG
10819       if (VerifySCEV)
10820         SE->verify();
10821 #endif
10822     }
10823   }
10824 
10825   // Process each loop nest in the function.
10826   return LoopVectorizeResult(Changed, CFGChanged);
10827 }
10828 
10829 PreservedAnalyses LoopVectorizePass::run(Function &F,
10830                                          FunctionAnalysisManager &AM) {
10831   LI = &AM.getResult<LoopAnalysis>(F);
10832   // There are no loops in the function. Return before computing other
10833   // expensive analyses.
10834   if (LI->empty())
10835     return PreservedAnalyses::all();
10836   SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
10837   TTI = &AM.getResult<TargetIRAnalysis>(F);
10838   DT = &AM.getResult<DominatorTreeAnalysis>(F);
10839   TLI = &AM.getResult<TargetLibraryAnalysis>(F);
10840   AC = &AM.getResult<AssumptionAnalysis>(F);
10841   DB = &AM.getResult<DemandedBitsAnalysis>(F);
10842   ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
10843   LAIs = &AM.getResult<LoopAccessAnalysis>(F);
10844 
10845   auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
10846   PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
10847   BFI = nullptr;
10848   if (PSI && PSI->hasProfileSummary())
10849     BFI = &AM.getResult<BlockFrequencyAnalysis>(F);
10850   LoopVectorizeResult Result = runImpl(F);
10851   if (!Result.MadeAnyChange)
10852     return PreservedAnalyses::all();
10853   PreservedAnalyses PA;
10854 
10855   if (isAssignmentTrackingEnabled(*F.getParent())) {
10856     for (auto &BB : F)
10857       RemoveRedundantDbgInstrs(&BB);
10858   }
10859 
10860   PA.preserve<LoopAnalysis>();
10861   PA.preserve<DominatorTreeAnalysis>();
10862   PA.preserve<ScalarEvolutionAnalysis>();
10863   PA.preserve<LoopAccessAnalysis>();
10864 
10865   if (Result.MadeCFGChange) {
10866     // Making CFG changes likely means a loop got vectorized. Indicate that
10867     // extra simplification passes should be run.
10868     // TODO: MadeCFGChanges is not a prefect proxy. Extra passes should only
10869     // be run if runtime checks have been added.
10870     AM.getResult<ShouldRunExtraVectorPasses>(F);
10871     PA.preserve<ShouldRunExtraVectorPasses>();
10872   } else {
10873     PA.preserveSet<CFGAnalyses>();
10874   }
10875   return PA;
10876 }
10877 
10878 void LoopVectorizePass::printPipeline(
10879     raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
10880   static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
10881       OS, MapClassName2PassName);
10882 
10883   OS << '<';
10884   OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
10885   OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
10886   OS << '>';
10887 }
10888