1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
17 //
18 // This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
28 // There is a development effort going on to migrate the loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanAnalysis.h"
61 #include "VPlanHCFGBuilder.h"
62 #include "VPlanPatternMatch.h"
63 #include "VPlanTransforms.h"
64 #include "VPlanUtils.h"
65 #include "VPlanVerifier.h"
66 #include "llvm/ADT/APInt.h"
67 #include "llvm/ADT/ArrayRef.h"
68 #include "llvm/ADT/DenseMap.h"
69 #include "llvm/ADT/DenseMapInfo.h"
70 #include "llvm/ADT/Hashing.h"
71 #include "llvm/ADT/MapVector.h"
72 #include "llvm/ADT/STLExtras.h"
73 #include "llvm/ADT/SmallPtrSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/TypeSwitch.h"
79 #include "llvm/ADT/iterator_range.h"
80 #include "llvm/Analysis/AssumptionCache.h"
81 #include "llvm/Analysis/BasicAliasAnalysis.h"
82 #include "llvm/Analysis/BlockFrequencyInfo.h"
83 #include "llvm/Analysis/CFG.h"
84 #include "llvm/Analysis/CodeMetrics.h"
85 #include "llvm/Analysis/DemandedBits.h"
86 #include "llvm/Analysis/GlobalsModRef.h"
87 #include "llvm/Analysis/LoopAccessAnalysis.h"
88 #include "llvm/Analysis/LoopAnalysisManager.h"
89 #include "llvm/Analysis/LoopInfo.h"
90 #include "llvm/Analysis/LoopIterator.h"
91 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
92 #include "llvm/Analysis/ProfileSummaryInfo.h"
93 #include "llvm/Analysis/ScalarEvolution.h"
94 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
95 #include "llvm/Analysis/TargetLibraryInfo.h"
96 #include "llvm/Analysis/TargetTransformInfo.h"
97 #include "llvm/Analysis/ValueTracking.h"
98 #include "llvm/Analysis/VectorUtils.h"
99 #include "llvm/IR/Attributes.h"
100 #include "llvm/IR/BasicBlock.h"
101 #include "llvm/IR/CFG.h"
102 #include "llvm/IR/Constant.h"
103 #include "llvm/IR/Constants.h"
104 #include "llvm/IR/DataLayout.h"
105 #include "llvm/IR/DebugInfo.h"
106 #include "llvm/IR/DebugLoc.h"
107 #include "llvm/IR/DerivedTypes.h"
108 #include "llvm/IR/DiagnosticInfo.h"
109 #include "llvm/IR/Dominators.h"
110 #include "llvm/IR/Function.h"
111 #include "llvm/IR/IRBuilder.h"
112 #include "llvm/IR/InstrTypes.h"
113 #include "llvm/IR/Instruction.h"
114 #include "llvm/IR/Instructions.h"
115 #include "llvm/IR/IntrinsicInst.h"
116 #include "llvm/IR/Intrinsics.h"
117 #include "llvm/IR/MDBuilder.h"
118 #include "llvm/IR/Metadata.h"
119 #include "llvm/IR/Module.h"
120 #include "llvm/IR/Operator.h"
121 #include "llvm/IR/PatternMatch.h"
122 #include "llvm/IR/ProfDataUtils.h"
123 #include "llvm/IR/Type.h"
124 #include "llvm/IR/Use.h"
125 #include "llvm/IR/User.h"
126 #include "llvm/IR/Value.h"
127 #include "llvm/IR/Verifier.h"
128 #include "llvm/Support/Casting.h"
129 #include "llvm/Support/CommandLine.h"
130 #include "llvm/Support/Debug.h"
131 #include "llvm/Support/ErrorHandling.h"
132 #include "llvm/Support/InstructionCost.h"
133 #include "llvm/Support/MathExtras.h"
134 #include "llvm/Support/NativeFormatting.h"
135 #include "llvm/Support/raw_ostream.h"
136 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
137 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
138 #include "llvm/Transforms/Utils/Local.h"
139 #include "llvm/Transforms/Utils/LoopSimplify.h"
140 #include "llvm/Transforms/Utils/LoopUtils.h"
141 #include "llvm/Transforms/Utils/LoopVersioning.h"
142 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
143 #include "llvm/Transforms/Utils/SizeOpts.h"
144 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
145 #include <algorithm>
146 #include <cassert>
147 #include <cstdint>
148 #include <functional>
149 #include <iterator>
150 #include <limits>
151 #include <memory>
152 #include <string>
153 #include <tuple>
154 #include <utility>
155 
156 using namespace llvm;
157 
158 #define LV_NAME "loop-vectorize"
159 #define DEBUG_TYPE LV_NAME
160 
161 #ifndef NDEBUG
162 const char VerboseDebug[] = DEBUG_TYPE "-verbose";
163 #endif
164 
165 /// @{
166 /// Metadata attribute names
167 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
168 const char LLVMLoopVectorizeFollowupVectorized[] =
169     "llvm.loop.vectorize.followup_vectorized";
170 const char LLVMLoopVectorizeFollowupEpilogue[] =
171     "llvm.loop.vectorize.followup_epilogue";
172 /// @}
173 
174 STATISTIC(LoopsVectorized, "Number of loops vectorized");
175 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
176 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
177 
178 static cl::opt<bool> EnableEpilogueVectorization(
179     "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
180     cl::desc("Enable vectorization of epilogue loops."));
181 
182 static cl::opt<unsigned> EpilogueVectorizationForceVF(
183     "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
184     cl::desc("When epilogue vectorization is enabled, and a value greater than "
185              "1 is specified, forces the given VF for all applicable epilogue "
186              "loops."));
187 
188 static cl::opt<unsigned> EpilogueVectorizationMinVF(
189     "epilogue-vectorization-minimum-VF", cl::Hidden,
190     cl::desc("Only loops with vectorization factor equal to or larger than "
191              "the specified value are considered for epilogue vectorization."));
192 
193 /// Loops with a known constant trip count below this number are vectorized only
194 /// if no scalar iteration overheads are incurred.
195 static cl::opt<unsigned> TinyTripCountVectorThreshold(
196     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
197     cl::desc("Loops with a constant trip count that is smaller than this "
198              "value are vectorized only if no scalar iteration overheads "
199              "are incurred."));
200 
201 static cl::opt<unsigned> VectorizeMemoryCheckThreshold(
202     "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
203     cl::desc("The maximum allowed number of runtime memory checks"));
204 
205 // The option prefer-predicate-over-epilogue indicates that an epilogue is
206 // undesired and that predication is preferred; the enum below lists the
207 // choices. I.e., the vectorizer will try to fold the tail loop (epilogue) into
208 // the vector body and predicate the instructions accordingly. If tail-folding
209 // fails, there are different fallback strategies depending on these values:
210 namespace PreferPredicateTy {
211   enum Option {
212     ScalarEpilogue = 0,
213     PredicateElseScalarEpilogue,
214     PredicateOrDontVectorize
215   };
216 } // namespace PreferPredicateTy
217 
218 static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
219     "prefer-predicate-over-epilogue",
220     cl::init(PreferPredicateTy::ScalarEpilogue),
221     cl::Hidden,
222     cl::desc("Tail-folding and predication preferences over creating a scalar "
223              "epilogue loop."),
224     cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
225                          "scalar-epilogue",
226                          "Don't tail-predicate loops, create scalar epilogue"),
227               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
228                          "predicate-else-scalar-epilogue",
229                          "Prefer tail-folding; create a scalar epilogue if "
230                          "tail-folding fails."),
231               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
232                          "predicate-dont-vectorize",
233                          "Prefer tail-folding; don't attempt vectorization if "
234                          "tail-folding fails.")));
235 
236 static cl::opt<TailFoldingStyle> ForceTailFoldingStyle(
237     "force-tail-folding-style", cl::desc("Force the tail folding style"),
238     cl::init(TailFoldingStyle::None),
239     cl::values(
240         clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"),
241         clEnumValN(
242             TailFoldingStyle::Data, "data",
243             "Create lane mask for data only, using active.lane.mask intrinsic"),
244         clEnumValN(TailFoldingStyle::DataWithoutLaneMask,
245                    "data-without-lane-mask",
246                    "Create lane mask with compare/stepvector"),
247         clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control",
248                    "Create lane mask using active.lane.mask intrinsic, and use "
249                    "it for both data and control flow"),
250         clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
251                    "data-and-control-without-rt-check",
252                    "Similar to data-and-control, but remove the runtime check"),
253         clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl",
254                    "Use predicated EVL instructions for tail folding. If EVL "
255                    "is unsupported, fallback to data-without-lane-mask.")));
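
// As a rough illustration of tail folding (conceptual IR only; types, masks
// and operands are simplified), the vector body predicates its memory accesses
// with a lane mask instead of branching to a scalar remainder:
//
//   %mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 %index, i64 %n)
//   %v = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %p, i32 4,
//                                                  <4 x i1> %mask, <4 x i32> poison)
//   ...
//   call void @llvm.masked.store.v4i32.p0(<4 x i32> %r, ptr %q, i32 4,
//                                         <4 x i1> %mask)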
256 
257 static cl::opt<bool> MaximizeBandwidth(
258     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
259     cl::desc("Maximize bandwidth when selecting the vectorization factor, "
260              "which will be determined by the smallest type in the loop."));
261 
262 static cl::opt<bool> EnableInterleavedMemAccesses(
263     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
264     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
265 
266 /// An interleave-group may need masking if it resides in a block that needs
267 /// predication, or in order to mask away gaps.
268 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
269     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
270     cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
271 
272 static cl::opt<unsigned> ForceTargetNumScalarRegs(
273     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
274     cl::desc("A flag that overrides the target's number of scalar registers."));
275 
276 static cl::opt<unsigned> ForceTargetNumVectorRegs(
277     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
278     cl::desc("A flag that overrides the target's number of vector registers."));
279 
280 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
281     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
282     cl::desc("A flag that overrides the target's max interleave factor for "
283              "scalar loops."));
284 
285 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
286     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
287     cl::desc("A flag that overrides the target's max interleave factor for "
288              "vectorized loops."));
289 
290 cl::opt<unsigned> ForceTargetInstructionCost(
291     "force-target-instruction-cost", cl::init(0), cl::Hidden,
292     cl::desc("A flag that overrides the target's expected cost for "
293              "an instruction to a single constant value. Mostly "
294              "useful for getting consistent testing."));
295 
296 static cl::opt<bool> ForceTargetSupportsScalableVectors(
297     "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
298     cl::desc(
299         "Pretend that scalable vectors are supported, even if the target does "
300         "not support them. This flag should only be used for testing."));
301 
302 static cl::opt<unsigned> SmallLoopCost(
303     "small-loop-cost", cl::init(20), cl::Hidden,
304     cl::desc(
305         "The cost of a loop that is considered 'small' by the interleaver."));
306 
307 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
308     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
309     cl::desc("Enable the use of the block frequency analysis to access PGO "
310              "heuristics minimizing code growth in cold regions and being more "
311              "aggressive in hot regions."));
312 
313 // Runtime interleave loops for load/store throughput.
314 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
315     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
316     cl::desc(
317         "Enable runtime interleaving until load/store ports are saturated"));
318 
319 /// The number of stores in a loop that are allowed to need predication.
320 static cl::opt<unsigned> NumberOfStoresToPredicate(
321     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
322     cl::desc("Max number of stores to be predicated behind an if."));
323 
324 static cl::opt<bool> EnableIndVarRegisterHeur(
325     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
326     cl::desc("Count the induction variable only once when interleaving"));
327 
328 static cl::opt<bool> EnableCondStoresVectorization(
329     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
330     cl::desc("Enable if predication of stores during vectorization."));
331 
332 static cl::opt<unsigned> MaxNestedScalarReductionIC(
333     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
334     cl::desc("The maximum interleave count to use when interleaving a scalar "
335              "reduction in a nested loop."));
336 
337 static cl::opt<bool>
338     PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
339                            cl::Hidden,
340                            cl::desc("Prefer in-loop vector reductions, "
341                                     "overriding the target's preference."));
342 
343 static cl::opt<bool> ForceOrderedReductions(
344     "force-ordered-reductions", cl::init(false), cl::Hidden,
345     cl::desc("Enable the vectorization of loops with in-order (strict) "
346              "FP reductions"));
347 
348 static cl::opt<bool> PreferPredicatedReductionSelect(
349     "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
350     cl::desc(
351         "Prefer predicating a reduction operation over an after loop select."));
352 
353 namespace llvm {
354 cl::opt<bool> EnableVPlanNativePath(
355     "enable-vplan-native-path", cl::Hidden,
356     cl::desc("Enable VPlan-native vectorization path with "
357              "support for outer loop vectorization."));
358 } // namespace llvm
359 
360 // This flag enables the stress testing of the VPlan H-CFG construction in the
361 // VPlan-native vectorization path. It must be used in conjunction with
362 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
363 // verification of the H-CFGs built.
364 static cl::opt<bool> VPlanBuildStressTest(
365     "vplan-build-stress-test", cl::init(false), cl::Hidden,
366     cl::desc(
367         "Build VPlan for every supported loop nest in the function and bail "
368         "out right after the build (stress test the VPlan H-CFG construction "
369         "in the VPlan-native vectorization path)."));
370 
371 cl::opt<bool> llvm::EnableLoopInterleaving(
372     "interleave-loops", cl::init(true), cl::Hidden,
373     cl::desc("Enable loop interleaving in Loop vectorization passes"));
374 cl::opt<bool> llvm::EnableLoopVectorization(
375     "vectorize-loops", cl::init(true), cl::Hidden,
376     cl::desc("Run the Loop vectorization passes"));
377 
378 static cl::opt<cl::boolOrDefault> ForceSafeDivisor(
379     "force-widen-divrem-via-safe-divisor", cl::Hidden,
380     cl::desc(
381         "Override cost based safe divisor widening for div/rem instructions"));
382 
383 static cl::opt<bool> UseWiderVFIfCallVariantsPresent(
384     "vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true),
385     cl::Hidden,
386     cl::desc("Try wider VFs if they enable the use of vector variants"));
387 
388 static cl::opt<bool> EnableEarlyExitVectorization(
389     "enable-early-exit-vectorization", cl::init(false), cl::Hidden,
390     cl::desc(
391         "Enable vectorization of early exit loops with uncountable exits."));
392 
393 // Likelihood of bypassing the vectorized loop because assumptions about SCEV
394 // variables not overflowing do not hold. See `emitSCEVChecks`.
395 static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127};
396 // Likelihood of bypassing the vectorized loop because pointers overlap. See
397 // `emitMemRuntimeChecks`.
398 static constexpr uint32_t MemCheckBypassWeights[] = {1, 127};
399 // Likelihood of bypassing the vectorized loop because there are zero trips left
400 // after prolog. See `emitIterationCountCheck`.
401 static constexpr uint32_t MinItersBypassWeights[] = {1, 127};
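// In all three cases the weights {1, 127} encode an assumed probability of
// roughly 1/128 for taking the bypass edge; the exact values are heuristic.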
402 
403 /// A helper function that returns true if the given type is irregular. The
404 /// type is irregular if its allocated size doesn't equal the store size of an
405 /// element of the corresponding vector type.
406 static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
407   // Determine if an array of N elements of type Ty is "bitcast compatible"
408   // with a <N x Ty> vector.
409   // This is only true if there is no padding between the array elements.
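  // For example (illustrative): x86_fp80 typically has a store size of 80 bits
  // but is allocated with padding (96 or 128 bits depending on the ABI), so an
  // array of x86_fp80 is not bitcast-compatible with a vector of x86_fp80.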
410   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
411 }
412 
413 /// Returns "best known" trip count for the specified loop \p L as defined by
414 /// the following procedure:
415 ///   1) Returns exact trip count if it is known.
416 ///   2) Returns expected trip count according to profile data if any.
417 ///   3) Returns upper bound estimate if known, and if \p CanUseConstantMax.
418 ///   4) Returns std::nullopt if all of the above failed.
419 static std::optional<unsigned>
420 getSmallBestKnownTC(PredicatedScalarEvolution &PSE, Loop *L,
421                     bool CanUseConstantMax = true) {
422   // Check if exact trip count is known.
423   if (unsigned ExpectedTC = PSE.getSE()->getSmallConstantTripCount(L))
424     return ExpectedTC;
425 
426   // Check if there is an expected trip count available from profile data.
427   if (LoopVectorizeWithBlockFrequency)
428     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
429       return *EstimatedTC;
430 
431   if (!CanUseConstantMax)
432     return std::nullopt;
433 
434   // Check if upper bound estimate is known.
435   if (unsigned ExpectedTC = PSE.getSmallConstantMaxTripCount())
436     return ExpectedTC;
437 
438   return std::nullopt;
439 }
440 
441 namespace {
442 // Forward declare GeneratedRTChecks.
443 class GeneratedRTChecks;
444 
445 using SCEV2ValueTy = DenseMap<const SCEV *, Value *>;
446 } // namespace
447 
448 namespace llvm {
449 
450 AnalysisKey ShouldRunExtraVectorPasses::Key;
451 
452 /// InnerLoopVectorizer vectorizes loops which contain only one basic
453 /// block to a specified vectorization factor (VF).
454 /// This class performs the widening of scalars into vectors, or multiple
455 /// scalars. This class also implements the following features:
456 /// * It inserts an epilogue loop for handling loops that don't have iteration
457 ///   counts that are known to be a multiple of the vectorization factor.
458 /// * It handles the code generation for reduction variables.
459 /// * Scalarization (implementation using scalars) of un-vectorizable
460 ///   instructions.
461 /// InnerLoopVectorizer does not perform any vectorization-legality
462 /// checks, and relies on the caller to check for the different legality
463 /// aspects. The InnerLoopVectorizer relies on the
464 /// LoopVectorizationLegality class to provide information about the induction
465 /// and reduction variables that were found, for a given vectorization factor.
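///
/// As a rough sketch (block names are illustrative and some checks may be
/// absent for a given loop), the generated skeleton looks like:
///
///   iteration-count check --(too few iterations)--> scalar preheader
///         |
///   SCEV / memory runtime checks --(a check fails)--> scalar preheader
///         |
///   vector preheader -> vector loop -> middle block -> scalar preheader
///                                                   \-> exit block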
466 class InnerLoopVectorizer {
467 public:
468   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
469                       LoopInfo *LI, DominatorTree *DT,
470                       const TargetLibraryInfo *TLI,
471                       const TargetTransformInfo *TTI, AssumptionCache *AC,
472                       OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
473                       ElementCount MinProfitableTripCount,
474                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
475                       LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
476                       ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks,
477                       VPlan &Plan)
478       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
479         AC(AC), ORE(ORE), VF(VecWidth),
480         MinProfitableTripCount(MinProfitableTripCount), UF(UnrollFactor),
481         Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
482         PSI(PSI), RTChecks(RTChecks), Plan(Plan),
483         VectorPHVPB(Plan.getEntry()->getSingleSuccessor()) {
484     // Query this against the original loop and save it here because the profile
485     // of the original loop header may change as the transformation happens.
486     OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
487         OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
488   }
489 
490   virtual ~InnerLoopVectorizer() = default;
491 
492   /// Create a new empty loop that will contain vectorized instructions later
493   /// on, while the old loop will be used as the scalar remainder. Control flow
494   /// is generated around the vectorized (and scalar epilogue) loops consisting
495   /// of various checks and bypasses. Return the pre-header block of the new
496 /// loop. In the case of epilogue vectorization, this function is overridden to
497   /// handle the more complex control flow around the loops. \p ExpandedSCEVs is
498   /// used to look up SCEV expansions for expressions needed during skeleton
499   /// creation.
500   virtual BasicBlock *
501   createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs);
502 
503   /// Fix the vectorized code, taking care of header phi's, and more.
504   void fixVectorizedLoop(VPTransformState &State);
505 
506   // Return true if any runtime check is added.
507   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
508 
509   /// A helper function to scalarize a single Instruction in the innermost loop.
510   /// Generates a scalar instance of \p Instr for the vector lane \p Lane,
511   /// using the VPValue operands from \p RepRecipe instead of \p Instr's
512   /// operands.
514   void scalarizeInstruction(const Instruction *Instr,
515                             VPReplicateRecipe *RepRecipe, const VPLane &Lane,
516                             VPTransformState &State);
517 
518   /// Fix the non-induction PHIs in \p Plan.
519   void fixNonInductionPHIs(VPTransformState &State);
520 
521   /// Returns the original loop trip count.
522   Value *getTripCount() const { return TripCount; }
523 
524   /// Used to set the trip count after ILV's construction and after the
525   /// preheader block has been executed. Note that this always holds the trip
526   /// count of the original loop for both main loop and epilogue vectorization.
527   void setTripCount(Value *TC) { TripCount = TC; }
528 
529   /// Retrieve the additional bypass value associated with an original
530   /// induction header phi.
531   Value *getInductionAdditionalBypassValue(PHINode *OrigPhi) const {
532     return Induction2AdditionalBypassValue.at(OrigPhi);
533   }
534 
535   /// Return the additional bypass block which targets the scalar loop by
536   /// skipping the epilogue loop after completing the main loop.
537   BasicBlock *getAdditionalBypassBlock() const {
538     assert(AdditionalBypassBlock &&
539            "Trying to access AdditionalBypassBlock but it has not been set");
540     return AdditionalBypassBlock;
541   }
542 
543 protected:
544   friend class LoopVectorizationPlanner;
545 
546   /// Set up the values of the IVs correctly when exiting the vector loop.
547   virtual void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
548                             Value *VectorTripCount, BasicBlock *MiddleBlock,
549                             VPTransformState &State);
550 
551   /// Iteratively sink the scalarized operands of a predicated instruction into
552   /// the block that was created for it.
553   void sinkScalarOperands(Instruction *PredInst);
554 
555   /// Returns (and creates if needed) the trip count of the widened loop.
556   Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);
557 
558   /// Emit a bypass check to see if the vector trip count is zero, including if
559   /// it overflows.
560   void emitIterationCountCheck(BasicBlock *Bypass);
561 
562   /// Emit a bypass check to see if all of the SCEV assumptions we've
563   /// had to make are correct. Returns the block containing the checks or
564   /// nullptr if no checks have been added.
565   BasicBlock *emitSCEVChecks(BasicBlock *Bypass);
566 
567   /// Emit bypass checks to check any memory assumptions we may have made.
568   /// Returns the block containing the checks or nullptr if no checks have been
569   /// added.
570   BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass);
571 
572   /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
573   /// vector loop preheader, middle block and scalar preheader.
574   void createVectorLoopSkeleton(StringRef Prefix);
575 
576   /// Create and record the values for induction variables to resume coming from
577   /// the additional bypass block.
578   void createInductionAdditionalBypassValues(const SCEV2ValueTy &ExpandedSCEVs,
579                                              Value *MainVectorTripCount);
580 
581   /// Allow subclasses to override and print debug traces before/after vplan
582   /// execution, when trace information is requested.
583   virtual void printDebugTracesAtStart() {}
584   virtual void printDebugTracesAtEnd() {}
585 
586   /// Introduces a new VPIRBasicBlock for \p CheckIRBB to Plan between the
587   /// vector preheader and its predecessor, also connecting the new block to the
588   /// scalar preheader.
589   void introduceCheckBlockInVPlan(BasicBlock *CheckIRBB);
590 
591   /// The original loop.
592   Loop *OrigLoop;
593 
594   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
595   /// dynamic knowledge to simplify SCEV expressions and converts them to a
596   /// more usable form.
597   PredicatedScalarEvolution &PSE;
598 
599   /// Loop Info.
600   LoopInfo *LI;
601 
602   /// Dominator Tree.
603   DominatorTree *DT;
604 
605   /// Target Library Info.
606   const TargetLibraryInfo *TLI;
607 
608   /// Target Transform Info.
609   const TargetTransformInfo *TTI;
610 
611   /// Assumption Cache.
612   AssumptionCache *AC;
613 
614   /// Interface to emit optimization remarks.
615   OptimizationRemarkEmitter *ORE;
616 
617   /// The vectorization SIMD factor to use. Each vector will have this many
618   /// vector elements.
619   ElementCount VF;
620 
621   ElementCount MinProfitableTripCount;
622 
623   /// The vectorization unroll factor to use. Each scalar is vectorized to this
624   /// many different vector instructions.
625   unsigned UF;
626 
627   /// The builder that we use
628   IRBuilder<> Builder;
629 
630   // --- Vectorization state ---
631 
632   /// The vector-loop preheader.
633   BasicBlock *LoopVectorPreHeader;
634 
635   /// The scalar-loop preheader.
636   BasicBlock *LoopScalarPreHeader;
637 
638   /// Middle Block between the vector and the scalar.
639   BasicBlock *LoopMiddleBlock;
640 
641   /// A list of all bypass blocks. The first block is the entry of the loop.
642   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
643 
644   /// Store instructions that were predicated.
645   SmallVector<Instruction *, 4> PredicatedInstructions;
646 
647   /// Trip count of the original loop.
648   Value *TripCount = nullptr;
649 
650   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
651   Value *VectorTripCount = nullptr;
652 
653   /// The legality analysis.
654   LoopVectorizationLegality *Legal;
655 
656   /// The profitability analysis.
657   LoopVectorizationCostModel *Cost;
658 
659   // Record whether runtime checks are added.
660   bool AddedSafetyChecks = false;
661 
662   /// BFI and PSI are used to check for profile guided size optimizations.
663   BlockFrequencyInfo *BFI;
664   ProfileSummaryInfo *PSI;
665 
666   // Whether this loop should be optimized for size based on profile guided size
667   // optimizations.
668   bool OptForSizeBasedOnProfile;
669 
670   /// Structure to hold information about generated runtime checks, responsible
671   /// for cleaning the checks, if vectorization turns out unprofitable.
672   GeneratedRTChecks &RTChecks;
673 
674   /// Mapping of induction phis to their additional bypass values. They
675   /// need to be added as operands to phi nodes in the scalar loop preheader
676   /// after the epilogue skeleton has been created.
677   DenseMap<PHINode *, Value *> Induction2AdditionalBypassValue;
678 
679   /// The additional bypass block which conditionally skips over the epilogue
680   /// loop after executing the main loop. Needed to resume inductions and
681   /// reductions during epilogue vectorization.
682   BasicBlock *AdditionalBypassBlock = nullptr;
683 
684   VPlan &Plan;
685 
686   /// The vector preheader block of \p Plan, used as target for check blocks
687   /// introduced during skeleton creation.
688   VPBlockBase *VectorPHVPB;
689 };
690 
691 /// Encapsulate information regarding vectorization of a loop and its epilogue.
692 /// This information is meant to be updated and used across two stages of
693 /// epilogue vectorization.
694 struct EpilogueLoopVectorizationInfo {
695   ElementCount MainLoopVF = ElementCount::getFixed(0);
696   unsigned MainLoopUF = 0;
697   ElementCount EpilogueVF = ElementCount::getFixed(0);
698   unsigned EpilogueUF = 0;
699   BasicBlock *MainLoopIterationCountCheck = nullptr;
700   BasicBlock *EpilogueIterationCountCheck = nullptr;
701   BasicBlock *SCEVSafetyCheck = nullptr;
702   BasicBlock *MemSafetyCheck = nullptr;
703   Value *TripCount = nullptr;
704   Value *VectorTripCount = nullptr;
705   VPlan &EpiloguePlan;
706 
707   EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
708                                 ElementCount EVF, unsigned EUF,
709                                 VPlan &EpiloguePlan)
710       : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF),
711         EpiloguePlan(EpiloguePlan) {
712     assert(EUF == 1 &&
713            "A high UF for the epilogue loop is likely not beneficial.");
714   }
715 };
716 
717 /// An extension of the inner loop vectorizer that creates a skeleton for a
718 /// vectorized loop that has its epilogue (residual) also vectorized.
719 /// The idea is to run the vplan on a given loop twice, firstly to set up the
720 /// skeleton and vectorize the main loop, and secondly to complete the skeleton
721 /// from the first step and vectorize the epilogue.  This is achieved by
722 /// deriving two concrete strategy classes from this base class and invoking
723 /// them in succession from the loop vectorizer planner.
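///
/// As a rough sketch (names are illustrative), the combined layout produced by
/// the two passes is:
///
///   main iteration-count check -> runtime checks -> main vector loop
///         |                                              |
///         v                                              v
///   epilogue iteration-count check <-------------- main middle block
///         |                \
///         v                 v
///   epilogue vector loop    scalar remainder loop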
724 class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
725 public:
726   InnerLoopAndEpilogueVectorizer(
727       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
728       DominatorTree *DT, const TargetLibraryInfo *TLI,
729       const TargetTransformInfo *TTI, AssumptionCache *AC,
730       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
731       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
732       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
733       GeneratedRTChecks &Checks, VPlan &Plan)
734       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
735                             EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
736                             CM, BFI, PSI, Checks, Plan),
737         EPI(EPI) {}
738 
739   // Override this function to handle the more complex control flow around the
740   // three loops.
741   BasicBlock *
742   createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final {
743     return createEpilogueVectorizedLoopSkeleton(ExpandedSCEVs);
744   }
745 
746   /// The interface for creating a vectorized skeleton using one of two
747   /// different strategies, each corresponding to one execution of the vplan
748   /// as described above.
749   virtual BasicBlock *
750   createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) = 0;
751 
752   /// Holds and updates state information required to vectorize the main loop
753   /// and its epilogue in two separate passes. This setup helps us avoid
754   /// regenerating and recomputing runtime safety checks. It also helps us to
755   /// shorten the iteration-count-check path length for the cases where the
756   /// iteration count of the loop is so small that the main vector loop is
757   /// completely skipped.
758   EpilogueLoopVectorizationInfo &EPI;
759 };
760 
761 /// A specialized derived class of inner loop vectorizer that performs
762 /// vectorization of *main* loops in the process of vectorizing loops and their
763 /// epilogues.
764 class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
765 public:
766   EpilogueVectorizerMainLoop(
767       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
768       DominatorTree *DT, const TargetLibraryInfo *TLI,
769       const TargetTransformInfo *TTI, AssumptionCache *AC,
770       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
771       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
772       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
773       GeneratedRTChecks &Check, VPlan &Plan)
774       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
775                                        EPI, LVL, CM, BFI, PSI, Check, Plan) {}
776   /// Implements the interface for creating a vectorized skeleton using the
777   /// *main loop* strategy (i.e., the first pass of vplan execution).
778   BasicBlock *
779   createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
780 
781 protected:
782   /// Emits an iteration count bypass check once for the main loop (when \p
783   /// ForEpilogue is false) and once for the epilogue loop (when \p
784   /// ForEpilogue is true).
785   BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
786   void printDebugTracesAtStart() override;
787   void printDebugTracesAtEnd() override;
788 
789   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
790                     Value *VectorTripCount, BasicBlock *MiddleBlock,
791                     VPTransformState &State) override {};
792 };
793 
794 // A specialized derived class of inner loop vectorizer that performs
795 // vectorization of *epilogue* loops in the process of vectorizing loops and
796 // their epilogues.
797 class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
798 public:
799   EpilogueVectorizerEpilogueLoop(
800       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
801       DominatorTree *DT, const TargetLibraryInfo *TLI,
802       const TargetTransformInfo *TTI, AssumptionCache *AC,
803       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
804       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
805       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
806       GeneratedRTChecks &Checks, VPlan &Plan)
807       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
808                                        EPI, LVL, CM, BFI, PSI, Checks, Plan) {
809     TripCount = EPI.TripCount;
810   }
811   /// Implements the interface for creating a vectorized skeleton using the
812   /// *epilogue loop* strategy (i.e., the second pass of vplan execution).
813   BasicBlock *
814   createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
815 
816 protected:
817   /// Emits an iteration count bypass check after the main vector loop has
818   /// finished to see if there are any iterations left to execute by either
819   /// the vector epilogue or the scalar epilogue.
820   BasicBlock *
821   emitMinimumVectorEpilogueIterCountCheck(BasicBlock *Bypass,
822                                           BasicBlock *Insert);
823   void printDebugTracesAtStart() override;
824   void printDebugTracesAtEnd() override;
825 };
826 } // end namespace llvm
827 
828 /// Look for a meaningful debug location on the instruction or its operands.
829 static DebugLoc getDebugLocFromInstOrOperands(Instruction *I) {
830   if (!I)
831     return DebugLoc();
832 
833   DebugLoc Empty;
834   if (I->getDebugLoc() != Empty)
835     return I->getDebugLoc();
836 
837   for (Use &Op : I->operands()) {
838     if (Instruction *OpInst = dyn_cast<Instruction>(Op))
839       if (OpInst->getDebugLoc() != Empty)
840         return OpInst->getDebugLoc();
841   }
842 
843   return I->getDebugLoc();
844 }
845 
846 /// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
847 /// is passed, the message relates to that particular instruction.
848 #ifndef NDEBUG
849 static void debugVectorizationMessage(const StringRef Prefix,
850                                       const StringRef DebugMsg,
851                                       Instruction *I) {
852   dbgs() << "LV: " << Prefix << DebugMsg;
853   if (I != nullptr)
854     dbgs() << " " << *I;
855   else
856     dbgs() << '.';
857   dbgs() << '\n';
858 }
859 #endif
860 
861 /// Create an analysis remark that explains why vectorization failed
862 ///
863 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
864 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
865 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
866 /// the location of the remark. If \p DL is passed, use it as debug location for
867 /// the remark. \return the remark object that can be streamed to.
868 static OptimizationRemarkAnalysis
869 createLVAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop,
870                  Instruction *I, DebugLoc DL = {}) {
871   Value *CodeRegion = I ? I->getParent() : TheLoop->getHeader();
872   // If debug location is attached to the instruction, use it. Otherwise if DL
873   // was not provided, use the loop's.
874   if (I && I->getDebugLoc())
875     DL = I->getDebugLoc();
876   else if (!DL)
877     DL = TheLoop->getStartLoc();
878 
879   return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
880 }
881 
882 namespace llvm {
883 
884 /// Return a value for Step multiplied by VF.
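/// For example, with a fixed VF of 4 and Step 2 this is simply the constant 8,
/// while for a scalable VF of <vscale x 4> it is a runtime value equal to
/// 8 * vscale (materialized via IRBuilderBase::CreateElementCount).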
885 Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
886                        int64_t Step) {
887   assert(Ty->isIntegerTy() && "Expected an integer step");
888   return B.CreateElementCount(Ty, VF.multiplyCoefficientBy(Step));
889 }
890 
891 /// Return the runtime value for VF.
892 Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
893   return B.CreateElementCount(Ty, VF);
894 }
895 
896 void reportVectorizationFailure(const StringRef DebugMsg,
897                                 const StringRef OREMsg, const StringRef ORETag,
898                                 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
899                                 Instruction *I) {
900   LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
901   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
902   ORE->emit(
903       createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
904       << "loop not vectorized: " << OREMsg);
905 }
906 
907 /// Reports an informative message: print \p Msg for debugging purposes as well
908 /// as an optimization remark. Uses either \p I as location of the remark, or
909 /// otherwise \p TheLoop. If \p DL is passed, use it as debug location for the
910 /// remark.
911 static void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
912                                     OptimizationRemarkEmitter *ORE,
913                                     Loop *TheLoop, Instruction *I = nullptr,
914                                     DebugLoc DL = {}) {
915   LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
916   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
917   ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop,
918                              I, DL)
919             << Msg);
920 }
921 
922 /// Report successful vectorization of the loop. In case an outer loop is
923 /// vectorized, prepend "outer" to the vectorization remark.
924 static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop,
925                                 VectorizationFactor VF, unsigned IC) {
926   LLVM_DEBUG(debugVectorizationMessage(
927       "Vectorizing: ", TheLoop->isInnermost() ? "innermost loop" : "outer loop",
928       nullptr));
929   StringRef LoopType = TheLoop->isInnermost() ? "" : "outer ";
930   ORE->emit([&]() {
931     return OptimizationRemark(LV_NAME, "Vectorized", TheLoop->getStartLoc(),
932                               TheLoop->getHeader())
933            << "vectorized " << LoopType << "loop (vectorization width: "
934            << ore::NV("VectorizationFactor", VF.Width)
935            << ", interleaved count: " << ore::NV("InterleaveCount", IC) << ")";
936   });
937 }
938 
939 } // end namespace llvm
940 
941 namespace llvm {
942 
943 // Loop vectorization cost-model hints about how the scalar epilogue loop
944 // should be lowered.
945 enum ScalarEpilogueLowering {
946 
947   // The default: allowing scalar epilogues.
948   CM_ScalarEpilogueAllowed,
949 
950   // Vectorization with OptForSize: don't allow epilogues.
951   CM_ScalarEpilogueNotAllowedOptSize,
952 
953   // A special case of vectorization with OptForSize: loops with a very small
954   // trip count are considered for vectorization under OptForSize, thereby
955   // making sure the cost of their loop body is dominant, free of runtime
956   // guards and scalar iteration overheads.
957   CM_ScalarEpilogueNotAllowedLowTripLoop,
958 
959   // Loop hint predicate indicating an epilogue is undesired.
960   CM_ScalarEpilogueNotNeededUsePredicate,
961 
962   // Directive indicating we must either tail-fold or not vectorize.
963   CM_ScalarEpilogueNotAllowedUsePredicate
964 };
965 
966 using InstructionVFPair = std::pair<Instruction *, ElementCount>;
967 
968 /// LoopVectorizationCostModel - estimates the expected speedups due to
969 /// vectorization.
970 /// In many cases vectorization is not profitable. This can happen because of
971 /// a number of reasons. In this class we mainly attempt to predict the
972 /// expected speedup/slowdowns due to the supported instruction set. We use the
973 /// TargetTransformInfo to query the different backends for the cost of
974 /// different operations.
975 class LoopVectorizationCostModel {
976   friend class LoopVectorizationPlanner;
977 
978 public:
979   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
980                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
981                              LoopVectorizationLegality *Legal,
982                              const TargetTransformInfo &TTI,
983                              const TargetLibraryInfo *TLI, DemandedBits *DB,
984                              AssumptionCache *AC,
985                              OptimizationRemarkEmitter *ORE, const Function *F,
986                              const LoopVectorizeHints *Hints,
987                              InterleavedAccessInfo &IAI)
988       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
989         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
990         Hints(Hints), InterleaveInfo(IAI), CostKind(TTI::TCK_RecipThroughput) {}
991 
992   /// \return An upper bound for the vectorization factors (both fixed and
993   /// scalable). If the factors are 0, vectorization and interleaving should be
994   /// avoided up front.
995   FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
996 
997   /// \return True if runtime checks are required for vectorization, and false
998   /// otherwise.
999   bool runtimeChecksRequired();
1000 
1001   /// Setup cost-based decisions for user vectorization factor.
1002   /// \return true if the UserVF is a feasible VF to be chosen.
1003   bool selectUserVectorizationFactor(ElementCount UserVF) {
1004     collectUniformsAndScalars(UserVF);
1005     collectInstsToScalarize(UserVF);
1006     return expectedCost(UserVF).isValid();
1007   }
1008 
1009   /// \return The size (in bits) of the smallest and widest types in the code
1010   /// that needs to be vectorized. We ignore values that remain scalar such as
1011   /// 64 bit loop indices.
1012   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1013 
1014   /// \return The desired interleave count.
1015   /// If interleave count has been specified by metadata it will be returned.
1016   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1017   /// are the selected vectorization factor and the cost of the selected VF.
1018   unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost);
1019 
1020   /// Memory access instruction may be vectorized in more than one way.
1021   /// Form of instruction after vectorization depends on cost.
1022   /// This function takes cost-based decisions for Load/Store instructions
1023   /// and collects them in a map. This decisions map is used for building
1024   /// the lists of loop-uniform and loop-scalar instructions.
1025   /// The calculated cost is saved with widening decision in order to
1026   /// avoid redundant calculations.
1027   void setCostBasedWideningDecision(ElementCount VF);
1028 
1029   /// A call may be vectorized in different ways depending on whether we have
1030   /// vectorized variants available and whether the target supports masking.
1031   /// This function analyzes all calls in the function at the supplied VF,
1032   /// makes a decision based on the costs of available options, and stores that
1033   /// decision in a map for use in planning and plan execution.
1034   void setVectorizedCallDecision(ElementCount VF);
1035 
1036   /// A struct that represents some properties of the register usage
1037   /// of a loop.
1038   struct RegisterUsage {
1039     /// Holds the number of loop invariant values that are used in the loop.
1040     /// The key is ClassID of target-provided register class.
1041     SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1042     /// Holds the maximum number of concurrent live intervals in the loop.
1043     /// The key is ClassID of target-provided register class.
1044     SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1045   };
1046 
1047   /// \return Returns information about the register usages of the loop for the
1048   /// given vectorization factors.
1049   SmallVector<RegisterUsage, 8>
1050   calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1051 
1052   /// Collect values we want to ignore in the cost model.
1053   void collectValuesToIgnore();
1054 
1055   /// Collect all element types in the loop for which widening is needed.
1056   void collectElementTypesForWidening();
1057 
1058   /// Split reductions into those that happen in the loop, and those that happen
1059   /// outside. In-loop reductions are collected into InLoopReductions.
1060   void collectInLoopReductions();
1061 
1062   /// Returns true if we should use strict in-order reductions for the given
1063   /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
1064   /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
1065   /// of FP operations.
1066   bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
1067     return !Hints->allowReordering() && RdxDesc.isOrdered();
1068   }
1069 
1070   /// \returns The smallest bitwidth each instruction can be represented with.
1071   /// The vector equivalents of these instructions should be truncated to this
1072   /// type.
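  /// For example (illustrative): if i8 data is zero-extended to i32, added,
  /// and the result truncated back to i8 before being stored, MinBWs may
  /// record 8 for the add, so its widened form can operate on i8 vector
  /// elements instead of i32.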
1073   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1074     return MinBWs;
1075   }
1076 
1077   /// \returns True if it is more profitable to scalarize instruction \p I for
1078   /// vectorization factor \p VF.
1079   bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1080     assert(VF.isVector() &&
1081            "Profitable to scalarize relevant only for VF > 1.");
1082     assert(
1083         TheLoop->isInnermost() &&
1084         "cost-model should not be used for outer loops (in VPlan-native path)");
1085 
1086     auto Scalars = InstsToScalarize.find(VF);
1087     assert(Scalars != InstsToScalarize.end() &&
1088            "VF not yet analyzed for scalarization profitability");
1089     return Scalars->second.contains(I);
1090   }
1091 
1092   /// Returns true if \p I is known to be uniform after vectorization.
1093   bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1094     assert(
1095         TheLoop->isInnermost() &&
1096         "cost-model should not be used for outer loops (in VPlan-native path)");
1097     // Pseudo probe needs to be duplicated for each unrolled iteration and
1098     // vector lane so that profiled loop trip count can be accurately
1099     // accumulated instead of being under counted.
1100     if (isa<PseudoProbeInst>(I))
1101       return false;
1102 
1103     if (VF.isScalar())
1104       return true;
1105 
1106     auto UniformsPerVF = Uniforms.find(VF);
1107     assert(UniformsPerVF != Uniforms.end() &&
1108            "VF not yet analyzed for uniformity");
1109     return UniformsPerVF->second.count(I);
1110   }
1111 
1112   /// Returns true if \p I is known to be scalar after vectorization.
1113   bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1114     assert(
1115         TheLoop->isInnermost() &&
1116         "cost-model should not be used for outer loops (in VPlan-native path)");
1117     if (VF.isScalar())
1118       return true;
1119 
1120     auto ScalarsPerVF = Scalars.find(VF);
1121     assert(ScalarsPerVF != Scalars.end() &&
1122            "Scalar values are not calculated for VF");
1123     return ScalarsPerVF->second.count(I);
1124   }
1125 
1126   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1127   /// for vectorization factor \p VF.
1128   bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1129     return VF.isVector() && MinBWs.contains(I) &&
1130            !isProfitableToScalarize(I, VF) &&
1131            !isScalarAfterVectorization(I, VF);
1132   }
1133 
1134   /// Decision that was taken during cost calculation for memory instruction.
1135   enum InstWidening {
1136     CM_Unknown,
1137     CM_Widen,         // For consecutive accesses with stride +1.
1138     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1139     CM_Interleave,
1140     CM_GatherScatter,
1141     CM_Scalarize,
1142     CM_VectorCall,
1143     CM_IntrinsicCall
1144   };
1145 
1146   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1147   /// instruction \p I and vector width \p VF.
1148   void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1149                            InstructionCost Cost) {
1150     assert(VF.isVector() && "Expected VF >=2");
1151     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1152   }
1153 
1154   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1155   /// interleaving group \p Grp and vector width \p VF.
1156   void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1157                            ElementCount VF, InstWidening W,
1158                            InstructionCost Cost) {
1159     assert(VF.isVector() && "Expected VF >=2");
1160     // Broadcast this decision to all instructions inside the group.
1161     // When interleaving, the cost will only be assigned to one instruction,
1162     // the insert position. For other cases, add the appropriate fraction of
1163     // the total cost to each instruction. This ensures accurate costs are
1164     // used, even if the insert position instruction is not used.
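    // For example (numbers are illustrative): for a group with 4 members and a
    // total cost of 8, CM_Interleave assigns 8 to the insert position and 0 to
    // the other members, whereas any other decision assigns 8 / 4 = 2 to every
    // member.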
1165     InstructionCost InsertPosCost = Cost;
1166     InstructionCost OtherMemberCost = 0;
1167     if (W != CM_Interleave)
1168       OtherMemberCost = InsertPosCost = Cost / Grp->getNumMembers();
1170     for (unsigned Idx = 0; Idx < Grp->getFactor(); ++Idx) {
1171       if (auto *I = Grp->getMember(Idx)) {
1172         if (Grp->getInsertPos() == I)
1173           WideningDecisions[std::make_pair(I, VF)] =
1174               std::make_pair(W, InsertPosCost);
1175         else
1176           WideningDecisions[std::make_pair(I, VF)] =
1177               std::make_pair(W, OtherMemberCost);
1178       }
1179     }
1180   }
1181 
1182   /// Return the cost model decision for the given instruction \p I and vector
1183   /// width \p VF. Return CM_Unknown if this instruction did not pass
1184   /// through the cost modeling.
1185   InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1186     assert(VF.isVector() && "Expected VF to be a vector VF");
1187     assert(
1188         TheLoop->isInnermost() &&
1189         "cost-model should not be used for outer loops (in VPlan-native path)");
1190 
1191     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1192     auto Itr = WideningDecisions.find(InstOnVF);
1193     if (Itr == WideningDecisions.end())
1194       return CM_Unknown;
1195     return Itr->second.first;
1196   }
1197 
1198   /// Return the vectorization cost for the given instruction \p I and vector
1199   /// width \p VF.
1200   InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1201     assert(VF.isVector() && "Expected VF >=2");
1202     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1203     assert(WideningDecisions.contains(InstOnVF) &&
1204            "The cost is not calculated");
1205     return WideningDecisions[InstOnVF].second;
1206   }
1207 
1208   struct CallWideningDecision {
1209     InstWidening Kind;
1210     Function *Variant;
1211     Intrinsic::ID IID;
1212     std::optional<unsigned> MaskPos;
1213     InstructionCost Cost;
1214   };
1215 
1216   void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind,
1217                                Function *Variant, Intrinsic::ID IID,
1218                                std::optional<unsigned> MaskPos,
1219                                InstructionCost Cost) {
1220     assert(!VF.isScalar() && "Expected vector VF");
1221     CallWideningDecisions[std::make_pair(CI, VF)] = {Kind, Variant, IID,
1222                                                      MaskPos, Cost};
1223   }
1224 
1225   CallWideningDecision getCallWideningDecision(CallInst *CI,
1226                                                ElementCount VF) const {
1227     assert(!VF.isScalar() && "Expected vector VF");
1228     return CallWideningDecisions.at(std::make_pair(CI, VF));
1229   }
1230 
1231   /// Return True if instruction \p I is an optimizable truncate whose operand
1232   /// is an induction variable. Such a truncate will be removed by adding a new
1233   /// induction variable with the destination type.
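       /// For example, a 'trunc i64 %iv to i32' of an induction variable %iv can
       /// be removed by creating a new i32 induction variable instead of
       /// truncating every vector lane.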
1234   bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1235     // If the instruction is not a truncate, return false.
1236     auto *Trunc = dyn_cast<TruncInst>(I);
1237     if (!Trunc)
1238       return false;
1239 
1240     // Get the source and destination types of the truncate.
1241     Type *SrcTy = toVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1242     Type *DestTy = toVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1243 
1244     // If the truncate is free for the given types, return false. Replacing a
1245     // free truncate with an induction variable would add an induction variable
1246     // update instruction to each iteration of the loop. We exclude from this
1247     // check the primary induction variable since it will need an update
1248     // instruction regardless.
1249     Value *Op = Trunc->getOperand(0);
1250     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1251       return false;
1252 
1253     // If the truncated value is not an induction variable, return false.
1254     return Legal->isInductionPhi(Op);
1255   }
1256 
1257   /// Collects the instructions to scalarize for each predicated instruction in
1258   /// the loop.
1259   void collectInstsToScalarize(ElementCount VF);
1260 
1261   /// Collect Uniform and Scalar values for the given \p VF.
1262   /// The sets depend on CM decision for Load/Store instructions
1263   /// that may be vectorized as interleave, gather-scatter or scalarized.
1264   /// Also make a decision on what to do about call instructions in the loop
1265   /// at that VF -- scalarize, call a known vector routine, or call a
1266   /// vector intrinsic.
1267   void collectUniformsAndScalars(ElementCount VF) {
1268     // Do the analysis once.
1269     if (VF.isScalar() || Uniforms.contains(VF))
1270       return;
1271     setCostBasedWideningDecision(VF);
1272     collectLoopUniforms(VF);
1273     setVectorizedCallDecision(VF);
1274     collectLoopScalars(VF);
1275   }
1276 
1277   /// Returns true if the target machine supports masked store operation
1278   /// for the given \p DataType and kind of access to \p Ptr.
1279   bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1280     return Legal->isConsecutivePtr(DataType, Ptr) &&
1281            TTI.isLegalMaskedStore(DataType, Alignment);
1282   }
1283 
1284   /// Returns true if the target machine supports masked load operation
1285   /// for the given \p DataType and kind of access to \p Ptr.
1286   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1287     return Legal->isConsecutivePtr(DataType, Ptr) &&
1288            TTI.isLegalMaskedLoad(DataType, Alignment);
1289   }
1290 
1291   /// Returns true if the target machine can represent \p V as a masked gather
1292   /// or scatter operation.
1293   bool isLegalGatherOrScatter(Value *V, ElementCount VF) {
1294     bool LI = isa<LoadInst>(V);
1295     bool SI = isa<StoreInst>(V);
1296     if (!LI && !SI)
1297       return false;
1298     auto *Ty = getLoadStoreType(V);
1299     Align Align = getLoadStoreAlignment(V);
1300     if (VF.isVector())
1301       Ty = VectorType::get(Ty, VF);
1302     return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1303            (SI && TTI.isLegalMaskedScatter(Ty, Align));
1304   }
1305 
1306   /// Returns true if the target machine supports all of the reduction
1307   /// variables found for the given VF.
1308   bool canVectorizeReductions(ElementCount VF) const {
1309     return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1310       const RecurrenceDescriptor &RdxDesc = Reduction.second;
1311       return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1312     }));
1313   }
1314 
1315   /// Given costs for both strategies, return true if the scalar predication
1316   /// lowering should be used for div/rem.  This incorporates an override
1317   /// option so it is not simply a cost comparison.
1318   bool isDivRemScalarWithPredication(InstructionCost ScalarCost,
1319                                      InstructionCost SafeDivisorCost) const {
1320     switch (ForceSafeDivisor) {
1321     case cl::BOU_UNSET:
1322       return ScalarCost < SafeDivisorCost;
1323     case cl::BOU_TRUE:
1324       return false;
1325     case cl::BOU_FALSE:
1326       return true;
1327     }
1328     llvm_unreachable("impossible case value");
1329   }
1330 
1331   /// Returns true if \p I is an instruction which requires predication and
1332   /// for which our chosen predication strategy is scalarization (i.e. we
1333   /// don't have an alternate strategy such as masking available).
1334   /// \p VF is the vectorization factor that will be used to vectorize \p I.
1335   bool isScalarWithPredication(Instruction *I, ElementCount VF) const;
1336 
1337   /// Returns true if \p I is an instruction that needs to be predicated
1338   /// at runtime.  The result is independent of the predication mechanism.
1339   /// Superset of instructions that return true for isScalarWithPredication.
1340   bool isPredicatedInst(Instruction *I) const;
1341 
1342   /// Return the costs for our two available strategies for lowering a
1343   /// div/rem operation which requires speculating at least one lane.
1344   /// First result is for scalarization (will be invalid for scalable
1345   /// vectors); second is for the safe-divisor strategy.
1346   std::pair<InstructionCost, InstructionCost>
1347   getDivRemSpeculationCost(Instruction *I,
1348                            ElementCount VF) const;
1349 
1350   /// Returns true if \p I is a memory instruction with consecutive memory
1351   /// access that can be widened.
1352   bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF);
1353 
1354   /// Returns true if \p I is a memory instruction in an interleaved-group
1355   /// of memory accesses that can be vectorized with wide vector loads/stores
1356   /// and shuffles.
1357   bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF) const;
1358 
1359   /// Check if \p Instr belongs to any interleaved access group.
1360   bool isAccessInterleaved(Instruction *Instr) const {
1361     return InterleaveInfo.isInterleaved(Instr);
1362   }
1363 
1364   /// Get the interleaved access group that \p Instr belongs to.
1365   const InterleaveGroup<Instruction> *
1366   getInterleavedAccessGroup(Instruction *Instr) const {
1367     return InterleaveInfo.getInterleaveGroup(Instr);
1368   }
1369 
1370   /// Returns true if we're required to use a scalar epilogue for at least
1371   /// the final iteration of the original loop.
1372   bool requiresScalarEpilogue(bool IsVectorizing) const {
1373     if (!isScalarEpilogueAllowed()) {
1374       LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
1375       return false;
1376     }
1377     // If we might exit from anywhere but the latch and early exit vectorization
1378     // is disabled, we must run the exiting iteration in scalar form.
1379     if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch() &&
1380         !(EnableEarlyExitVectorization && Legal->hasUncountableEarlyExit())) {
1381       LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: not exiting "
1382                            "from latch block\n");
1383       return true;
1384     }
1385     if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) {
1386       LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: "
1387                            "interleaved group requires scalar epilogue\n");
1388       return true;
1389     }
1390     LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
1391     return false;
1392   }
1393 
1394   /// Returns true if we're required to use a scalar epilogue for at least
1395   /// the final iteration of the original loop for all VFs in \p Range.
1396   /// A scalar epilogue must either be required for all VFs in \p Range or for
1397   /// none.
1398   bool requiresScalarEpilogue(VFRange Range) const {
1399     auto RequiresScalarEpilogue = [this](ElementCount VF) {
1400       return requiresScalarEpilogue(VF.isVector());
1401     };
1402     bool IsRequired = all_of(Range, RequiresScalarEpilogue);
1403     assert(
1404         (IsRequired || none_of(Range, RequiresScalarEpilogue)) &&
1405         "all VFs in range must agree on whether a scalar epilogue is required");
1406     return IsRequired;
1407   }
1408 
1409   /// Returns true if a scalar epilogue is allowed; it may be disallowed,
1410   /// e.g. due to optsize or a loop hint annotation.
1411   bool isScalarEpilogueAllowed() const {
1412     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1413   }
1414 
1415   /// Returns the TailFoldingStyle that is best for the current loop.
1416   TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
1417     if (!ChosenTailFoldingStyle)
1418       return TailFoldingStyle::None;
1419     return IVUpdateMayOverflow ? ChosenTailFoldingStyle->first
1420                                : ChosenTailFoldingStyle->second;
1421   }
1422 
1423   /// Selects and saves the TailFoldingStyle for two cases: whether the IV
1424   /// update may overflow or not.
1425   /// \param IsScalableVF true if scalable vector factors are enabled.
1426   /// \param UserIC User-specified interleave count.
1427   void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC) {
1428     assert(!ChosenTailFoldingStyle && "Tail folding must not be selected yet.");
1429     if (!Legal->canFoldTailByMasking()) {
1430       ChosenTailFoldingStyle =
1431           std::make_pair(TailFoldingStyle::None, TailFoldingStyle::None);
1432       return;
1433     }
1434 
1435     if (!ForceTailFoldingStyle.getNumOccurrences()) {
1436       ChosenTailFoldingStyle = std::make_pair(
1437           TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/true),
1438           TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false));
1439       return;
1440     }
1441 
1442     // Set styles when forced.
1443     ChosenTailFoldingStyle = std::make_pair(ForceTailFoldingStyle.getValue(),
1444                                             ForceTailFoldingStyle.getValue());
1445     if (ForceTailFoldingStyle != TailFoldingStyle::DataWithEVL)
1446       return;
1447     // Override forced styles if needed.
1448     // FIXME: use actual opcode/data type for analysis here.
1449     // FIXME: Investigate opportunity for fixed vector factor.
1450     bool EVLIsLegal =
1451         UserIC <= 1 && TTI.hasActiveVectorLength(0, nullptr, Align()) &&
1452         !EnableVPlanNativePath &&
1453         // FIXME: remove this once fixed-ordered recurrence is supported.
1454         Legal->getFixedOrderRecurrences().empty();
1455     if (!EVLIsLegal) {
1456       // If for some reason EVL mode is unsupported, fall back to
1457       // DataWithoutLaneMask to try to vectorize the loop with folded tail
1458       // in a generic way.
1459       ChosenTailFoldingStyle =
1460           std::make_pair(TailFoldingStyle::DataWithoutLaneMask,
1461                          TailFoldingStyle::DataWithoutLaneMask);
1462       LLVM_DEBUG(
1463           dbgs()
1464           << "LV: Preference for VP intrinsics indicated. Will "
1465              "not try to generate VP Intrinsics "
1466           << (UserIC > 1
1467                   ? "since interleave count specified is greater than 1.\n"
1468                   : "due to non-interleaving reasons.\n"));
1469     }
1470   }
1471 
1472   /// Returns true if all loop blocks should be masked in order to fold the loop tail.
1473   bool foldTailByMasking() const {
1474     // TODO: check if it is possible to check for None style independent of
1475     // IVUpdateMayOverflow flag in getTailFoldingStyle.
1476     return getTailFoldingStyle() != TailFoldingStyle::None;
1477   }
1478 
1479   /// Return the maximum safe number of elements to be processed per vector
1480   /// iteration; processing that many elements neither prevents store-load
1481   /// forwarding nor violates memory dependencies. Required for EVL-based
1482   /// VPlans to correctly calculate AVL (application vector length) as
1483   /// min(remaining AVL, MaxSafeElements).
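       /// For example (illustrative numbers), with MaxSafeElements = 8 and a
       /// remaining AVL of 21, the next vector iteration processes
       /// min(21, 8) = 8 elements.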
1484   /// TODO: need to consider adjusting cost model to use this value as a
1485   /// vectorization factor for EVL-based vectorization.
1486   std::optional<unsigned> getMaxSafeElements() const { return MaxSafeElements; }
1487 
1488   /// Returns true if the instructions in this block require predication
1489   /// for any reason, e.g. because tail folding now requires a predicate
1490   /// or because the block in the original loop was predicated.
1491   bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
1492     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1493   }
1494 
1495   /// Returns true if VP intrinsics with explicit vector length support should
1496   /// be generated in the tail folded loop.
1497   bool foldTailWithEVL() const {
1498     return getTailFoldingStyle() == TailFoldingStyle::DataWithEVL;
1499   }
1500 
1501   /// Returns true if the Phi is part of an inloop reduction.
1502   bool isInLoopReduction(PHINode *Phi) const {
1503     return InLoopReductions.contains(Phi);
1504   }
1505 
1506   /// Returns true if the predicated reduction select should be used to set the
1507   /// incoming value for the reduction phi.
1508   bool usePredicatedReductionSelect(unsigned Opcode, Type *PhiTy) const {
1509     // Force to use predicated reduction select since the EVL of the
1510     // second-to-last iteration might not be VF*UF.
1511     if (foldTailWithEVL())
1512       return true;
1513     return PreferPredicatedReductionSelect ||
1514            TTI.preferPredicatedReductionSelect(
1515                Opcode, PhiTy, TargetTransformInfo::ReductionFlags());
1516   }
1517 
1518   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1519   /// with factor VF.  Return the cost of the instruction, including
1520   /// scalarization overhead if it's needed.
1521   InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1522 
1523   /// Estimate cost of a call instruction CI if it were vectorized with factor
1524   /// VF. Return the cost of the instruction, including scalarization overhead
1525   /// if it's needed.
1526   InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const;
1527 
1528   /// Invalidates decisions already taken by the cost model.
1529   void invalidateCostModelingDecisions() {
1530     WideningDecisions.clear();
1531     CallWideningDecisions.clear();
1532     Uniforms.clear();
1533     Scalars.clear();
1534   }
1535 
1536   /// Returns the expected execution cost. The unit of the cost does
1537   /// not matter because we use the 'cost' units to compare different
1538   /// vector widths. The cost that is returned is *not* normalized by
1539   /// the vectorization factor.
1540   InstructionCost expectedCost(ElementCount VF);
1541 
1542   bool hasPredStores() const { return NumPredStores > 0; }
1543 
1544   /// Returns true if epilogue vectorization is considered profitable, and
1545   /// false otherwise.
1546   /// \p VF is the vectorization factor chosen for the original loop.
1547   /// \p IC is the interleave count, an additional scaling factor applied to
1548   /// VF before comparing to EpilogueVectorizationMinVF.
1549   bool isEpilogueVectorizationProfitable(const ElementCount VF,
1550                                          const unsigned IC) const;
1551 
1552   /// Returns the execution time cost of an instruction for a given vector
1553   /// width. Vector width of one means scalar.
1554   InstructionCost getInstructionCost(Instruction *I, ElementCount VF);
1555 
1556   /// Return the cost of instructions in an inloop reduction pattern, if I is
1557   /// part of that pattern.
1558   std::optional<InstructionCost> getReductionPatternCost(Instruction *I,
1559                                                          ElementCount VF,
1560                                                          Type *VectorTy) const;
1561 
1562   /// Returns true if \p Op should be considered invariant and if it is
1563   /// trivially hoistable.
1564   bool shouldConsiderInvariant(Value *Op);
1565 
1566 private:
1567   unsigned NumPredStores = 0;
1568 
1569   /// \return An upper bound for the vectorization factors for both
1570   /// fixed and scalable vectorization, where the minimum-known number of
1571   /// elements is a power-of-2 larger than zero. If scalable vectorization is
1572   /// disabled or unsupported, then the scalable part will be equal to
1573   /// ElementCount::getScalable(0).
1574   FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount,
1575                                            ElementCount UserVF,
1576                                            bool FoldTailByMasking);
1577 
1578   /// \return the maximized element count based on the target's vector
1579   /// registers and the loop trip-count, but limited to a maximum safe VF.
1580   /// This is a helper function of computeFeasibleMaxVF.
1581   ElementCount getMaximizedVFForTarget(unsigned MaxTripCount,
1582                                        unsigned SmallestType,
1583                                        unsigned WidestType,
1584                                        ElementCount MaxSafeVF,
1585                                        bool FoldTailByMasking);
1586 
1587   /// Checks if scalable vectorization is supported and enabled. Caches the
1588   /// result to avoid repeated debug dumps for repeated queries.
1589   bool isScalableVectorizationAllowed();
1590 
1591   /// \return the maximum legal scalable VF, based on the safe max number
1592   /// of elements.
1593   ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1594 
1595   /// Calculate vectorization cost of memory instruction \p I.
1596   InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1597 
1598   /// The cost computation for scalarized memory instruction.
1599   InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1600 
1601   /// The cost computation for interleaving group of memory instructions.
1602   InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1603 
1604   /// The cost computation for Gather/Scatter instruction.
1605   InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1606 
1607   /// The cost computation for widening instruction \p I with consecutive
1608   /// memory access.
1609   InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1610 
1611   /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1612   /// Load: scalar load + broadcast.
1613   /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1614   /// element)
1615   InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1616 
1617   /// Estimate the overhead of scalarizing an instruction. This is a
1618   /// convenience wrapper for the type-based getScalarizationOverhead API.
1619   InstructionCost getScalarizationOverhead(Instruction *I,
1620                                            ElementCount VF) const;
1621 
1622   /// Returns true if an artificially high cost for emulated masked memrefs
1623   /// should be used.
1624   bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1625 
1626   /// Map of scalar integer values to the smallest bitwidth they can be legally
1627   /// represented as. The vector equivalents of these values should be truncated
1628   /// to this type.
1629   MapVector<Instruction *, uint64_t> MinBWs;
1630 
1631   /// A type representing the costs for instructions if they were to be
1632   /// scalarized rather than vectorized. The entries are Instruction-Cost
1633   /// pairs.
1634   using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1635 
1636   /// A per-VF map of the BasicBlocks that are known to be present after
1637   /// vectorization as predicated blocks.
1638   DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>>
1639       PredicatedBBsAfterVectorization;
1640 
1641   /// Records whether it is allowed to have the original scalar loop execute at
1642   /// least once. This may be needed as a fallback loop in case runtime
1643   /// aliasing/dependence checks fail, or to handle the tail/remainder
1644   /// iterations when the trip count is unknown or doesn't divide by the VF,
1645   /// or as a peel-loop to handle gaps in interleave-groups.
1646   /// Under optsize and when the trip count is very small we don't allow any
1647   /// iterations to execute in the scalar loop.
1648   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1649 
1650   /// The finally chosen tail-folding styles. The first element is used if
1651   /// the IV update may overflow, the second element if it does not.
1652   std::optional<std::pair<TailFoldingStyle, TailFoldingStyle>>
1653       ChosenTailFoldingStyle;
1654 
1655   /// true if scalable vectorization is supported and enabled.
1656   std::optional<bool> IsScalableVectorizationAllowed;
1657 
1658   /// Maximum safe number of elements to be processed per vector iteration;
1659   /// processing that many elements neither prevents store-load forwarding nor
1660   /// violates memory dependencies. Required for EVL-based vectorization, where
1661   /// this value is used as the upper bound of the safe AVL.
1662   std::optional<unsigned> MaxSafeElements;
1663 
1664   /// A map holding scalar costs for different vectorization factors. The
1665   /// presence of a cost for an instruction in the mapping indicates that the
1666   /// instruction will be scalarized when vectorizing with the associated
1667   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1668   DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1669 
1670   /// Holds the instructions known to be uniform after vectorization.
1671   /// The data is collected per VF.
1672   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1673 
1674   /// Holds the instructions known to be scalar after vectorization.
1675   /// The data is collected per VF.
1676   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1677 
1678   /// Holds the instructions (address computations) that are forced to be
1679   /// scalarized.
1680   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1681 
1682   /// PHINodes of the reductions that should be expanded in-loop.
1683   SmallPtrSet<PHINode *, 4> InLoopReductions;
1684 
1685   /// A Map of inloop reduction operations and their immediate chain operand.
1686   /// FIXME: This can be removed once reductions can be costed correctly in
1687   /// VPlan. This was added to allow quick lookup of the inloop operations.
1688   DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1689 
1690   /// Returns the expected difference in cost from scalarizing the expression
1691   /// feeding a predicated instruction \p PredInst. The instructions to
1692   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1693   /// non-negative return value implies the expression will be scalarized.
1694   /// Currently, only single-use chains are considered for scalarization.
1695   InstructionCost computePredInstDiscount(Instruction *PredInst,
1696                                           ScalarCostsTy &ScalarCosts,
1697                                           ElementCount VF);
1698 
1699   /// Collect the instructions that are uniform after vectorization. An
1700   /// instruction is uniform if we represent it with a single scalar value in
1701   /// the vectorized loop corresponding to each vector iteration. Examples of
1702   /// uniform instructions include pointer operands of consecutive or
1703   /// interleaved memory accesses. Note that although uniformity implies an
1704   /// instruction will be scalar, the reverse is not true. In general, a
1705   /// scalarized instruction will be represented by VF scalar values in the
1706   /// vectorized loop, each corresponding to an iteration of the original
1707   /// scalar loop.
1708   void collectLoopUniforms(ElementCount VF);
1709 
1710   /// Collect the instructions that are scalar after vectorization. An
1711   /// instruction is scalar if it is known to be uniform or will be scalarized
1712   /// during vectorization. collectLoopScalars should only add non-uniform nodes
1713   /// to the list if they are used by a load/store instruction that is marked as
1714   /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
1715   /// VF values in the vectorized loop, each corresponding to an iteration of
1716   /// the original scalar loop.
1717   void collectLoopScalars(ElementCount VF);
1718 
1719   /// Keeps cost model vectorization decision and cost for instructions.
1720   /// Right now it is used for memory instructions only.
1721   using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1722                                 std::pair<InstWidening, InstructionCost>>;
1723 
1724   DecisionList WideningDecisions;
1725 
1726   using CallDecisionList =
1727       DenseMap<std::pair<CallInst *, ElementCount>, CallWideningDecision>;
1728 
1729   CallDecisionList CallWideningDecisions;
1730 
1731   /// Returns true if \p V is expected to be vectorized and it needs to be
1732   /// extracted.
1733   bool needsExtract(Value *V, ElementCount VF) const {
1734     Instruction *I = dyn_cast<Instruction>(V);
1735     if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1736         TheLoop->isLoopInvariant(I) ||
1737         getWideningDecision(I, VF) == CM_Scalarize)
1738       return false;
1739 
1740     // Assume we can vectorize V (and hence we need extraction) if the
1741     // scalars are not computed yet. This can happen, because it is called
1742     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1743     // the scalars are collected. That should be a safe assumption in most
1744     // cases, because we check if the operands have vectorizable types
1745     // beforehand in LoopVectorizationLegality.
1746     return !Scalars.contains(VF) || !isScalarAfterVectorization(I, VF);
1747   };
1748 
1749   /// Returns a range containing only operands needing to be extracted.
1750   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1751                                                    ElementCount VF) const {
1752     return SmallVector<Value *, 4>(make_filter_range(
1753         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1754   }
1755 
1756 public:
1757   /// The loop that we evaluate.
1758   Loop *TheLoop;
1759 
1760   /// Predicated scalar evolution analysis.
1761   PredicatedScalarEvolution &PSE;
1762 
1763   /// Loop Info analysis.
1764   LoopInfo *LI;
1765 
1766   /// Vectorization legality.
1767   LoopVectorizationLegality *Legal;
1768 
1769   /// Vector target information.
1770   const TargetTransformInfo &TTI;
1771 
1772   /// Target Library Info.
1773   const TargetLibraryInfo *TLI;
1774 
1775   /// Demanded bits analysis.
1776   DemandedBits *DB;
1777 
1778   /// Assumption cache.
1779   AssumptionCache *AC;
1780 
1781   /// Interface to emit optimization remarks.
1782   OptimizationRemarkEmitter *ORE;
1783 
1784   const Function *TheFunction;
1785 
1786   /// Loop Vectorize Hint.
1787   const LoopVectorizeHints *Hints;
1788 
1789   /// The interleave access information contains groups of interleaved accesses
1790   /// with the same stride and close to each other.
1791   InterleavedAccessInfo &InterleaveInfo;
1792 
1793   /// Values to ignore in the cost model.
1794   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1795 
1796   /// Values to ignore in the cost model when VF > 1.
1797   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1798 
1799   /// All element types found in the loop.
1800   SmallPtrSet<Type *, 16> ElementTypesInLoop;
1801 
1802   /// The kind of cost that we are calculating
1803   TTI::TargetCostKind CostKind;
1804 };
1805 } // end namespace llvm
1806 
1807 namespace {
1808 /// Helper struct to manage generating runtime checks for vectorization.
1809 ///
1810 /// The runtime checks are created up-front in temporary blocks, un-linked from
1811 /// the existing IR, to allow better cost estimation. After deciding to
1812 /// vectorize, the checks are moved back. If deciding not to vectorize, the
1813 /// temporary blocks are completely removed.
1814 class GeneratedRTChecks {
1815   /// Basic block which contains the generated SCEV checks, if any.
1816   BasicBlock *SCEVCheckBlock = nullptr;
1817 
1818   /// The value representing the result of the generated SCEV checks. If it is
1819   /// nullptr, either no SCEV checks have been generated or they have been used.
1820   Value *SCEVCheckCond = nullptr;
1821 
1822   /// Basic block which contains the generated memory runtime checks, if any.
1823   BasicBlock *MemCheckBlock = nullptr;
1824 
1825   /// The value representing the result of the generated memory runtime checks.
1826   /// If it is nullptr, either no memory runtime checks have been generated or
1827   /// they have been used.
1828   Value *MemRuntimeCheckCond = nullptr;
1829 
1830   DominatorTree *DT;
1831   LoopInfo *LI;
1832   TargetTransformInfo *TTI;
1833 
1834   SCEVExpander SCEVExp;
1835   SCEVExpander MemCheckExp;
1836 
1837   bool CostTooHigh = false;
1838   const bool AddBranchWeights;
1839 
1840   Loop *OuterLoop = nullptr;
1841 
1842   PredicatedScalarEvolution &PSE;
1843 
1844   /// The kind of cost that we are calculating
1845   TTI::TargetCostKind CostKind;
1846 
1847 public:
1848   GeneratedRTChecks(PredicatedScalarEvolution &PSE, DominatorTree *DT,
1849                     LoopInfo *LI, TargetTransformInfo *TTI,
1850                     const DataLayout &DL, bool AddBranchWeights,
1851                     TTI::TargetCostKind CostKind)
1852       : DT(DT), LI(LI), TTI(TTI), SCEVExp(*PSE.getSE(), DL, "scev.check"),
1853         MemCheckExp(*PSE.getSE(), DL, "scev.check"),
1854         AddBranchWeights(AddBranchWeights), PSE(PSE), CostKind(CostKind) {}
1855 
1856   /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1857   /// accurately estimate the cost of the runtime checks. The blocks are
1858   /// un-linked from the IR and are added back during vector code generation. If
1859   /// there is no vector code generation, the check blocks are removed
1860   /// completely.
1861   void create(Loop *L, const LoopAccessInfo &LAI,
1862               const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {
1863 
1864     // Hard cutoff to limit compile-time increase in case a very large number of
1865     // runtime checks needs to be generated.
1866     // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
1867     // profile info.
1868     CostTooHigh =
1869         LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold;
1870     if (CostTooHigh)
1871       return;
1872 
1873     BasicBlock *LoopHeader = L->getHeader();
1874     BasicBlock *Preheader = L->getLoopPreheader();
1875 
1876     // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1877     // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1878     // may be used by SCEVExpander. The blocks will be un-linked from their
1879     // predecessors and removed from LI & DT at the end of the function.
1880     if (!UnionPred.isAlwaysTrue()) {
1881       SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1882                                   nullptr, "vector.scevcheck");
1883 
1884       SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1885           &UnionPred, SCEVCheckBlock->getTerminator());
1886     }
1887 
1888     const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1889     if (RtPtrChecking.Need) {
1890       auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1891       MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
1892                                  "vector.memcheck");
1893 
1894       auto DiffChecks = RtPtrChecking.getDiffChecks();
1895       if (DiffChecks) {
1896         Value *RuntimeVF = nullptr;
1897         MemRuntimeCheckCond = addDiffRuntimeChecks(
1898             MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp,
1899             [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
1900               if (!RuntimeVF)
1901                 RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);
1902               return RuntimeVF;
1903             },
1904             IC);
1905       } else {
1906         MemRuntimeCheckCond = addRuntimeChecks(
1907             MemCheckBlock->getTerminator(), L, RtPtrChecking.getChecks(),
1908             MemCheckExp, VectorizerParams::HoistRuntimeChecks);
1909       }
1910       assert(MemRuntimeCheckCond &&
1911              "no RT checks generated although RtPtrChecking "
1912              "claimed checks are required");
1913     }
1914 
1915     if (!MemCheckBlock && !SCEVCheckBlock)
1916       return;
1917 
1918     // Unhook the temporary block with the checks, update various places
1919     // accordingly.
1920     if (SCEVCheckBlock)
1921       SCEVCheckBlock->replaceAllUsesWith(Preheader);
1922     if (MemCheckBlock)
1923       MemCheckBlock->replaceAllUsesWith(Preheader);
1924 
1925     if (SCEVCheckBlock) {
1926       SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1927       new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
1928       Preheader->getTerminator()->eraseFromParent();
1929     }
1930     if (MemCheckBlock) {
1931       MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1932       new UnreachableInst(Preheader->getContext(), MemCheckBlock);
1933       Preheader->getTerminator()->eraseFromParent();
1934     }
1935 
1936     DT->changeImmediateDominator(LoopHeader, Preheader);
1937     if (MemCheckBlock) {
1938       DT->eraseNode(MemCheckBlock);
1939       LI->removeBlock(MemCheckBlock);
1940     }
1941     if (SCEVCheckBlock) {
1942       DT->eraseNode(SCEVCheckBlock);
1943       LI->removeBlock(SCEVCheckBlock);
1944     }
1945 
1946     // Outer loop is used as part of the later cost calculations.
1947     OuterLoop = L->getParentLoop();
1948   }
1949 
1950   InstructionCost getCost() {
1951     if (SCEVCheckBlock || MemCheckBlock)
1952       LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");
1953 
1954     if (CostTooHigh) {
1955       InstructionCost Cost;
1956       Cost.setInvalid();
1957       LLVM_DEBUG(dbgs() << "  number of checks exceeded threshold\n");
1958       return Cost;
1959     }
1960 
1961     InstructionCost RTCheckCost = 0;
1962     if (SCEVCheckBlock)
1963       for (Instruction &I : *SCEVCheckBlock) {
1964         if (SCEVCheckBlock->getTerminator() == &I)
1965           continue;
1966         InstructionCost C = TTI->getInstructionCost(&I, CostKind);
1967         LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
1968         RTCheckCost += C;
1969       }
1970     if (MemCheckBlock) {
1971       InstructionCost MemCheckCost = 0;
1972       for (Instruction &I : *MemCheckBlock) {
1973         if (MemCheckBlock->getTerminator() == &I)
1974           continue;
1975         InstructionCost C = TTI->getInstructionCost(&I, CostKind);
1976         LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
1977         MemCheckCost += C;
1978       }
1979 
1980       // If the runtime memory checks are being created inside an outer loop
1981       // we should find out if these checks are outer loop invariant. If so,
1982       // the checks will likely be hoisted out and so the effective cost is
1983       // reduced in proportion to the outer loop trip count.
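           // Illustrative example (hypothetical numbers): a memory-check cost of
           // 20 inside an outer loop with an estimated trip count of 5 is
           // accounted as 20 / 5 = 4 below.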
1984       if (OuterLoop) {
1985         ScalarEvolution *SE = MemCheckExp.getSE();
1986         // TODO: If profitable, we could refine this further by analysing every
1987         // individual memory check, since there could be a mixture of loop
1988         // variant and invariant checks that mean the final condition is
1989         // variant.
1990         const SCEV *Cond = SE->getSCEV(MemRuntimeCheckCond);
1991         if (SE->isLoopInvariant(Cond, OuterLoop)) {
1992           // It seems reasonable to assume that we can reduce the effective
1993           // cost of the checks even when we know nothing about the trip
1994           // count. Assume that the outer loop executes at least twice.
1995           unsigned BestTripCount = 2;
1996 
1997           // Get the best known TC estimate.
1998           if (auto EstimatedTC = getSmallBestKnownTC(
1999                   PSE, OuterLoop, /* CanUseConstantMax = */ false))
2000             BestTripCount = *EstimatedTC;
2001 
2002           BestTripCount = std::max(BestTripCount, 1U);
2003           InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount;
2004 
2005           // Let's ensure the cost is always at least 1.
2006           NewMemCheckCost = std::max(*NewMemCheckCost.getValue(),
2007                                      (InstructionCost::CostType)1);
2008 
2009           if (BestTripCount > 1)
2010             LLVM_DEBUG(dbgs()
2011                        << "We expect runtime memory checks to be hoisted "
2012                        << "out of the outer loop. Cost reduced from "
2013                        << MemCheckCost << " to " << NewMemCheckCost << '\n');
2014 
2015           MemCheckCost = NewMemCheckCost;
2016         }
2017       }
2018 
2019       RTCheckCost += MemCheckCost;
2020     }
2021 
2022     if (SCEVCheckBlock || MemCheckBlock)
2023       LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
2024                         << "\n");
2025 
2026     return RTCheckCost;
2027   }
2028 
2029   /// Remove the created SCEV & memory runtime check blocks & instructions, if
2030   /// unused.
2031   ~GeneratedRTChecks() {
2032     SCEVExpanderCleaner SCEVCleaner(SCEVExp);
2033     SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
2034     if (!SCEVCheckCond)
2035       SCEVCleaner.markResultUsed();
2036 
2037     if (!MemRuntimeCheckCond)
2038       MemCheckCleaner.markResultUsed();
2039 
2040     if (MemRuntimeCheckCond) {
2041       auto &SE = *MemCheckExp.getSE();
2042       // Memory runtime check generation creates compares that use expanded
2043       // values. Remove them before running the SCEVExpanderCleaners.
2044       for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
2045         if (MemCheckExp.isInsertedInstruction(&I))
2046           continue;
2047         SE.forgetValue(&I);
2048         I.eraseFromParent();
2049       }
2050     }
2051     MemCheckCleaner.cleanup();
2052     SCEVCleaner.cleanup();
2053 
2054     if (SCEVCheckCond)
2055       SCEVCheckBlock->eraseFromParent();
2056     if (MemRuntimeCheckCond)
2057       MemCheckBlock->eraseFromParent();
2058   }
2059 
2060   /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
2061   /// adjusts the branches to branch to the vector preheader or \p Bypass,
2062   /// depending on the generated condition.
2063   BasicBlock *emitSCEVChecks(BasicBlock *Bypass,
2064                              BasicBlock *LoopVectorPreHeader) {
2065     if (!SCEVCheckCond)
2066       return nullptr;
2067 
2068     Value *Cond = SCEVCheckCond;
2069     // Mark the check as used, to prevent it from being removed during cleanup.
2070     SCEVCheckCond = nullptr;
2071     if (auto *C = dyn_cast<ConstantInt>(Cond))
2072       if (C->isZero())
2073         return nullptr;
2074 
2075     auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2076 
2077     BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
2078     // Create new preheader for vector loop.
2079     if (OuterLoop)
2080       OuterLoop->addBasicBlockToLoop(SCEVCheckBlock, *LI);
2081 
2082     SCEVCheckBlock->getTerminator()->eraseFromParent();
2083     SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2084     Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2085                                                 SCEVCheckBlock);
2086 
2087     DT->addNewBlock(SCEVCheckBlock, Pred);
2088     DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
2089 
2090     BranchInst &BI = *BranchInst::Create(Bypass, LoopVectorPreHeader, Cond);
2091     if (AddBranchWeights)
2092       setBranchWeights(BI, SCEVCheckBypassWeights, /*IsExpected=*/false);
2093     ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), &BI);
2094     return SCEVCheckBlock;
2095   }
2096 
2097   /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2098   /// the branches to branch to the vector preheader or \p Bypass, depending on
2099   /// the generated condition.
2100   BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass,
2101                                    BasicBlock *LoopVectorPreHeader) {
2102     // Check if we generated code that checks in runtime if arrays overlap.
2103     if (!MemRuntimeCheckCond)
2104       return nullptr;
2105 
2106     auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2107     Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2108                                                 MemCheckBlock);
2109 
2110     DT->addNewBlock(MemCheckBlock, Pred);
2111     DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
2112     MemCheckBlock->moveBefore(LoopVectorPreHeader);
2113 
2114     if (OuterLoop)
2115       OuterLoop->addBasicBlockToLoop(MemCheckBlock, *LI);
2116 
2117     BranchInst &BI =
2118         *BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond);
2119     if (AddBranchWeights) {
2120       setBranchWeights(BI, MemCheckBypassWeights, /*IsExpected=*/false);
2121     }
2122     ReplaceInstWithInst(MemCheckBlock->getTerminator(), &BI);
2123     MemCheckBlock->getTerminator()->setDebugLoc(
2124         Pred->getTerminator()->getDebugLoc());
2125 
2126     // Mark the check as used, to prevent it from being removed during cleanup.
2127     MemRuntimeCheckCond = nullptr;
2128     return MemCheckBlock;
2129   }
2130 };
2131 } // namespace
2132 
2133 static bool useActiveLaneMask(TailFoldingStyle Style) {
2134   return Style == TailFoldingStyle::Data ||
2135          Style == TailFoldingStyle::DataAndControlFlow ||
2136          Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2137 }
2138 
2139 static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style) {
2140   return Style == TailFoldingStyle::DataAndControlFlow ||
2141          Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2142 }
2143 
2144 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
2145 // vectorization. The loop needs to be annotated with #pragma omp simd
2146 // simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If the
2147 // vector length information is not provided, vectorization is not considered
2148 // explicit. Interleave hints are not allowed either. These limitations will be
2149 // relaxed in the future.
2150 // Please note that we are currently forced to abuse the pragma 'clang
2151 // vectorize' semantics. This pragma provides *auto-vectorization hints*
2152 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2153 // provides *explicit vectorization hints* (LV can bypass legal checks and
2154 // assume that vectorization is legal). However, both hints are implemented
2155 // using the same metadata (llvm.loop.vectorize, processed by
2156 // LoopVectorizeHints). This will be fixed in the future when the native IR
2157 // representation for pragma 'omp simd' is introduced.
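     // For example, an outer loop annotated with '#pragma omp simd simdlen(8)'
     // or '#pragma clang loop vectorize(enable) vectorize_width(8)' qualifies,
     // whereas an unannotated outer loop is ignored.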
2158 static bool isExplicitVecOuterLoop(Loop *OuterLp,
2159                                    OptimizationRemarkEmitter *ORE) {
2160   assert(!OuterLp->isInnermost() && "This is not an outer loop");
2161   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2162 
2163   // Only outer loops with an explicit vectorization hint are supported.
2164   // Unannotated outer loops are ignored.
2165   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
2166     return false;
2167 
2168   Function *Fn = OuterLp->getHeader()->getParent();
2169   if (!Hints.allowVectorization(Fn, OuterLp,
2170                                 true /*VectorizeOnlyWhenForced*/)) {
2171     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2172     return false;
2173   }
2174 
2175   if (Hints.getInterleave() > 1) {
2176     // TODO: Interleave support is future work.
2177     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2178                          "outer loops.\n");
2179     Hints.emitRemarkWithHints();
2180     return false;
2181   }
2182 
2183   return true;
2184 }
2185 
2186 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
2187                                   OptimizationRemarkEmitter *ORE,
2188                                   SmallVectorImpl<Loop *> &V) {
2189   // Collect inner loops and outer loops without irreducible control flow. For
2190   // now, only collect outer loops that have explicit vectorization hints. If we
2191   // are stress testing the VPlan H-CFG construction, we collect the outermost
2192   // loop of every loop nest.
2193   if (L.isInnermost() || VPlanBuildStressTest ||
2194       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
2195     LoopBlocksRPO RPOT(&L);
2196     RPOT.perform(LI);
2197     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2198       V.push_back(&L);
2199       // TODO: Collect inner loops inside marked outer loops in case
2200       // vectorization fails for the outer loop. Do not invoke
2201       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2202       // already known to be reducible. We can use an inherited attribute for
2203       // that.
2204       return;
2205     }
2206   }
2207   for (Loop *InnerL : L)
2208     collectSupportedLoops(*InnerL, LI, ORE, V);
2209 }
2210 
2211 //===----------------------------------------------------------------------===//
2212 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2213 // LoopVectorizationCostModel and LoopVectorizationPlanner.
2214 //===----------------------------------------------------------------------===//
2215 
2216 /// Compute the transformed value of Index at offset StartValue using step
2217 /// StepValue.
2218 /// For integer induction, returns StartValue + Index * StepValue.
2219 /// For pointer induction, returns StartValue[Index * StepValue].
2220 /// FIXME: The newly created binary instructions should contain nsw/nuw
2221 /// flags, which can be found from the original scalar operations.
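     /// For example, for an integer induction with StartValue = 7 and
     /// StepValue = 3, Index = 4 is transformed to 7 + 4 * 3 = 19; for a pointer
     /// induction it becomes &StartValue[4 * 3].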
2222 static Value *
2223 emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue,
2224                      Value *Step,
2225                      InductionDescriptor::InductionKind InductionKind,
2226                      const BinaryOperator *InductionBinOp) {
2227   Type *StepTy = Step->getType();
2228   Value *CastedIndex = StepTy->isIntegerTy()
2229                            ? B.CreateSExtOrTrunc(Index, StepTy)
2230                            : B.CreateCast(Instruction::SIToFP, Index, StepTy);
2231   if (CastedIndex != Index) {
2232     CastedIndex->setName(CastedIndex->getName() + ".cast");
2233     Index = CastedIndex;
2234   }
2235 
2236   // Note: the IR at this point is broken. We cannot use SE to create any new
2237   // SCEV and then expand it, hoping that SCEV's simplification will give us
2238 // more optimal code. Unfortunately, attempting to do so on invalid IR may
2239   // lead to various SCEV crashes. So all we can do is to use builder and rely
2240   // on InstCombine for future simplifications. Here we handle some trivial
2241   // cases only.
2242   auto CreateAdd = [&B](Value *X, Value *Y) {
2243     assert(X->getType() == Y->getType() && "Types don't match!");
2244     if (auto *CX = dyn_cast<ConstantInt>(X))
2245       if (CX->isZero())
2246         return Y;
2247     if (auto *CY = dyn_cast<ConstantInt>(Y))
2248       if (CY->isZero())
2249         return X;
2250     return B.CreateAdd(X, Y);
2251   };
2252 
2253   // We allow X to be a vector type, in which case Y will potentially be
2254   // splatted into a vector with the same element count.
2255   auto CreateMul = [&B](Value *X, Value *Y) {
2256     assert(X->getType()->getScalarType() == Y->getType() &&
2257            "Types don't match!");
2258     if (auto *CX = dyn_cast<ConstantInt>(X))
2259       if (CX->isOne())
2260         return Y;
2261     if (auto *CY = dyn_cast<ConstantInt>(Y))
2262       if (CY->isOne())
2263         return X;
2264     VectorType *XVTy = dyn_cast<VectorType>(X->getType());
2265     if (XVTy && !isa<VectorType>(Y->getType()))
2266       Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
2267     return B.CreateMul(X, Y);
2268   };
2269 
2270   switch (InductionKind) {
2271   case InductionDescriptor::IK_IntInduction: {
2272     assert(!isa<VectorType>(Index->getType()) &&
2273            "Vector indices not supported for integer inductions yet");
2274     assert(Index->getType() == StartValue->getType() &&
2275            "Index type does not match StartValue type");
2276     if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne())
2277       return B.CreateSub(StartValue, Index);
2278     auto *Offset = CreateMul(Index, Step);
2279     return CreateAdd(StartValue, Offset);
2280   }
2281   case InductionDescriptor::IK_PtrInduction:
2282     return B.CreatePtrAdd(StartValue, CreateMul(Index, Step));
2283   case InductionDescriptor::IK_FpInduction: {
2284     assert(!isa<VectorType>(Index->getType()) &&
2285            "Vector indices not supported for FP inductions yet");
2286     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2287     assert(InductionBinOp &&
2288            (InductionBinOp->getOpcode() == Instruction::FAdd ||
2289             InductionBinOp->getOpcode() == Instruction::FSub) &&
2290            "Original bin op should be defined for FP induction");
2291 
2292     Value *MulExp = B.CreateFMul(Step, Index);
2293     return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2294                          "induction");
2295   }
2296   case InductionDescriptor::IK_NoInduction:
2297     return nullptr;
2298   }
2299   llvm_unreachable("invalid enum");
2300 }
2301 
2302 std::optional<unsigned> getMaxVScale(const Function &F,
2303                                      const TargetTransformInfo &TTI) {
2304   if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale())
2305     return MaxVScale;
2306 
2307   if (F.hasFnAttribute(Attribute::VScaleRange))
2308     return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
2309 
2310   return std::nullopt;
2311 }
2312 
2313 /// For the given VF and UF and maximum trip count computed for the loop, return
2314 /// true if the induction variable cannot overflow in the vectorized loop, i.e.
2315 /// a runtime overflow check on it would always evaluate to false and can be
2316 /// removed.
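     /// For example (illustrative, with an i8 induction type chosen only to keep
     /// the numbers small): given a known maximum trip count of 200, VF = 4 and
     /// UF = 2, we have 255 - 200 = 55 > 4 * 2 = 8, so the check can be removed.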
2317 static bool isIndvarOverflowCheckKnownFalse(
2318     const LoopVectorizationCostModel *Cost,
2319     ElementCount VF, std::optional<unsigned> UF = std::nullopt) {
2320   // Always be conservative if we don't know the exact unroll factor.
2321   unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF);
2322 
2323   Type *IdxTy = Cost->Legal->getWidestInductionType();
2324   APInt MaxUIntTripCount = cast<IntegerType>(IdxTy)->getMask();
2325 
2326   // The runtime overflow check is known false iff the (max) trip-count
2327   // is known and (max) trip-count + (VF * UF) does not overflow in the type of
2328   // the vector loop induction variable.
2329   if (unsigned TC = Cost->PSE.getSmallConstantMaxTripCount()) {
2330     uint64_t MaxVF = VF.getKnownMinValue();
2331     if (VF.isScalable()) {
2332       std::optional<unsigned> MaxVScale =
2333           getMaxVScale(*Cost->TheFunction, Cost->TTI);
2334       if (!MaxVScale)
2335         return false;
2336       MaxVF *= *MaxVScale;
2337     }
2338 
2339     return (MaxUIntTripCount - TC).ugt(MaxVF * MaxUF);
2340   }
2341 
2342   return false;
2343 }
2344 
2345 // Return whether we allow using masked interleave-groups (for dealing with
2346 // strided loads/stores that reside in predicated blocks, or for dealing
2347 // with gaps).
2348 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2349   // If an override option has been passed in for interleaved accesses, use it.
2350   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2351     return EnableMaskedInterleavedMemAccesses;
2352 
2353   return TTI.enableMaskedInterleavedAccessVectorization();
2354 }
2355 
2356 void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr,
2357                                                VPReplicateRecipe *RepRecipe,
2358                                                const VPLane &Lane,
2359                                                VPTransformState &State) {
2360   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2361 
2362   // Does this instruction return a value?
2363   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2364 
2365   Instruction *Cloned = Instr->clone();
2366   if (!IsVoidRetTy) {
2367     Cloned->setName(Instr->getName() + ".cloned");
2368 #if !defined(NDEBUG)
2369     // Verify that VPlan type inference results agree with the type of the
2370     // generated values.
2371     assert(State.TypeAnalysis.inferScalarType(RepRecipe) == Cloned->getType() &&
2372            "inferred type and type from generated instructions do not match");
2373 #endif
2374   }
2375 
2376   RepRecipe->setFlags(Cloned);
2377 
2378   if (auto DL = Instr->getDebugLoc())
2379     State.setDebugLocFrom(DL);
2380 
2381   // Replace the operands of the cloned instructions with their scalar
2382   // equivalents in the new loop.
2383   for (const auto &I : enumerate(RepRecipe->operands())) {
2384     auto InputLane = Lane;
2385     VPValue *Operand = I.value();
2386     if (vputils::isUniformAfterVectorization(Operand))
2387       InputLane = VPLane::getFirstLane();
2388     Cloned->setOperand(I.index(), State.get(Operand, InputLane));
2389   }
2390   State.addNewMetadata(Cloned, Instr);
2391 
2392   // Place the cloned scalar in the new loop.
2393   State.Builder.Insert(Cloned);
2394 
2395   State.set(RepRecipe, Cloned, Lane);
2396 
2397   // If we just cloned a new assumption, add it to the assumption cache.
2398   if (auto *II = dyn_cast<AssumeInst>(Cloned))
2399     AC->registerAssumption(II);
2400 
2401   // End if-block.
2402   VPRegionBlock *Parent = RepRecipe->getParent()->getParent();
2403   bool IfPredicateInstr = Parent ? Parent->isReplicator() : false;
2404   assert(
2405       (Parent || !RepRecipe->getParent()->getPlan()->getVectorLoopRegion() ||
2406        all_of(RepRecipe->operands(),
2407               [](VPValue *Op) { return Op->isDefinedOutsideLoopRegions(); })) &&
2408       "Expected a recipe is either within a region or all of its operands "
2409       "are defined outside the vectorized region.");
2410   if (IfPredicateInstr)
2411     PredicatedInstructions.push_back(Cloned);
2412 }
2413 
2414 Value *
2415 InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
2416   if (VectorTripCount)
2417     return VectorTripCount;
2418 
2419   Value *TC = getTripCount();
2420   IRBuilder<> Builder(InsertBlock->getTerminator());
2421 
2422   Type *Ty = TC->getType();
2423   // This is where we can make the step a runtime constant.
2424   Value *Step = createStepForVF(Builder, Ty, VF, UF);
2425 
2426   // If the tail is to be folded by masking, round the number of iterations N
2427   // up to a multiple of Step instead of rounding down. This is done by first
2428   // adding Step-1 and then rounding down. Note that it's ok if this addition
2429   // overflows: the vector induction variable will eventually wrap to zero given
2430   // that it starts at zero and its Step is a power of two; the loop will then
2431   // exit, with the last early-exit vector comparison also producing all-true.
2432   // For scalable vectors the VF is not guaranteed to be a power of 2, but this
2433   // is accounted for in emitIterationCountCheck that adds an overflow check.
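       // E.g. (illustrative): with TC = 10 and Step = VF * UF = 4, the rounding
       // below yields n.rnd.up = 13, and the n.vec computed further down becomes
       // 12, covering all 10 iterations under the mask.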
2434   if (Cost->foldTailByMasking()) {
2435     assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
2436            "VF*UF must be a power of 2 when folding tail by masking");
2437     TC = Builder.CreateAdd(TC, Builder.CreateSub(Step, ConstantInt::get(Ty, 1)),
2438                            "n.rnd.up");
2439   }
2440 
2441   // Now we need to generate the expression for the part of the loop that the
2442   // vectorized body will execute. This is equal to N - (N % Step) if scalar
2443   // iterations are not required for correctness, or N - Step, otherwise. Step
2444   // is equal to the vectorization factor (number of SIMD elements) times the
2445   // unroll factor (number of SIMD instructions).
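       // E.g. (illustrative): TC = 10, VF = 4, UF = 1 gives n.mod.vf = 2 and
       // n.vec = 8, i.e. two vector iterations plus two scalar remainder
       // iterations.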
2446   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2447 
2448   // There are cases where we *must* run at least one iteration in the remainder
2449   // loop.  See the cost model for when this can happen.  If the step evenly
2450   // divides the trip count, we set the remainder to be equal to the step. If
2451   // the step does not evenly divide the trip count, no adjustment is necessary
2452   // since there will already be scalar iterations. Note that the minimum
2453   // iterations check ensures that N >= Step.
2454   if (Cost->requiresScalarEpilogue(VF.isVector())) {
2455     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2456     R = Builder.CreateSelect(IsZero, Step, R);
2457   }
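       // E.g. (illustrative): with TC = 8, Step = 4 and a required scalar
       // epilogue, R is bumped from 0 to 4 above, so n.vec becomes 4 and the
       // epilogue executes the remaining 4 iterations.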
2458 
2459   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2460 
2461   return VectorTripCount;
2462 }
2463 
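     // Connect the runtime-check block CheckIRBB to the VPlan: the block
     // feeding the vector preheader gets the scalar preheader as an additional
     // successor so the check can bypass the vector loop, wrapping CheckIRBB in
     // a VPIRBasicBlock on that edge when needed.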
2464 void InnerLoopVectorizer::introduceCheckBlockInVPlan(BasicBlock *CheckIRBB) {
2465   VPBlockBase *ScalarPH = Plan.getScalarPreheader();
2466   VPBlockBase *PreVectorPH = VectorPHVPB->getSinglePredecessor();
2467   if (PreVectorPH->getNumSuccessors() != 1) {
2468     assert(PreVectorPH->getNumSuccessors() == 2 && "Expected 2 successors");
2469     assert(PreVectorPH->getSuccessors()[0] == ScalarPH &&
2470            "Unexpected successor");
2471     VPIRBasicBlock *CheckVPIRBB = Plan.createVPIRBasicBlock(CheckIRBB);
2472     VPBlockUtils::insertOnEdge(PreVectorPH, VectorPHVPB, CheckVPIRBB);
2473     PreVectorPH = CheckVPIRBB;
2474   }
2475   VPBlockUtils::connectBlocks(PreVectorPH, ScalarPH);
2476   PreVectorPH->swapSuccessors();
2477 }
2478 
2479 void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
2480   Value *Count = getTripCount();
2481   // Reuse existing vector loop preheader for TC checks.
2482   // Note that a new preheader block is generated for the vector loop.
2483   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2484   IRBuilder<> Builder(TCCheckBlock->getTerminator());
2485 
2486   // Generate code to check if the loop's trip count is less than VF * UF, or
2487   // equal to it in case a scalar epilogue is required; this implies that the
2488   // vector trip count is zero. This check also covers the case where adding one
2489   // to the backedge-taken count overflowed leading to an incorrect trip count
2490   // of zero. In this case we will also jump to the scalar loop.
2491   auto P = Cost->requiresScalarEpilogue(VF.isVector()) ? ICmpInst::ICMP_ULE
2492                                                        : ICmpInst::ICMP_ULT;
2493 
2494   // If tail is to be folded, vector loop takes care of all iterations.
2495   Type *CountTy = Count->getType();
2496   Value *CheckMinIters = Builder.getFalse();
2497   auto CreateStep = [&]() -> Value * {
2498     // Create the step as max(MinProfitableTripCount, UF * VF).
2499     if (UF * VF.getKnownMinValue() >= MinProfitableTripCount.getKnownMinValue())
2500       return createStepForVF(Builder, CountTy, VF, UF);
2501 
2502     Value *MinProfTC =
2503         createStepForVF(Builder, CountTy, MinProfitableTripCount, 1);
2504     if (!VF.isScalable())
2505       return MinProfTC;
2506     return Builder.CreateBinaryIntrinsic(
2507         Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF));
2508   };
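       // E.g. (illustrative): for VF = vscale x 4, UF = 2 and a fixed minimum
       // profitable trip count of 16, CreateStep() produces
       // umax(16, vscale * 8).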
2509 
2510   TailFoldingStyle Style = Cost->getTailFoldingStyle();
2511   if (Style == TailFoldingStyle::None) {
2512     Value *Step = CreateStep();
2513     ScalarEvolution &SE = *PSE.getSE();
2514     // TODO: Emit unconditional branch to vector preheader instead of
2515     // conditional branch with known condition.
2516     const SCEV *TripCountSCEV = SE.applyLoopGuards(SE.getSCEV(Count), OrigLoop);
2517     // Check if the trip count is < the step.
2518     if (SE.isKnownPredicate(P, TripCountSCEV, SE.getSCEV(Step))) {
2519       // TODO: Ensure step is at most the trip count when determining max VF and
2520       // UF, w/o tail folding.
2521       CheckMinIters = Builder.getTrue();
2522     } else if (!SE.isKnownPredicate(CmpInst::getInversePredicate(P),
2523                                     TripCountSCEV, SE.getSCEV(Step))) {
2524       // Generate the minimum iteration check only if we cannot prove the
2525       // check is known to be true, or known to be false.
2526       CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
2527     } // else step known to be < trip count, use CheckMinIters preset to false.
2528   } else if (VF.isScalable() &&
2529              !isIndvarOverflowCheckKnownFalse(Cost, VF, UF) &&
2530              Style != TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) {
2531     // vscale is not necessarily a power-of-2, which means we cannot guarantee
2532     // an overflow to zero when updating induction variables and so an
2533     // additional overflow check is required before entering the vector loop.
2534 
2535     // Get the maximum unsigned value for the type.
2536     Value *MaxUIntTripCount =
2537         ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask());
2538     Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count);
2539 
2540     // Don't execute the vector loop if (UMax - n) < (VF * UF).
2541     CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep());
2542   }
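       // E.g. (illustrative): with VF = 4, UF = 2, no tail folding and a
       // minimum profitable trip count of at most 8, the check computed above
       // is roughly '%min.iters.check = icmp ult iN %count, 8' (ule if a
       // scalar epilogue is required); it feeds the branch created below.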
2543 
2544   // Create new preheader for vector loop.
2545   LoopVectorPreHeader =
2546       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
2547                  "vector.ph");
2548 
2549   assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
2550                                DT->getNode(Bypass)->getIDom()) &&
2551          "TC check is expected to dominate Bypass");
2552 
2553   BranchInst &BI =
2554       *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
2555   if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
2556     setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
2557   ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
2558   LoopBypassBlocks.push_back(TCCheckBlock);
2559 
2560   // TODO: Wrap LoopVectorPreHeader in VPIRBasicBlock here.
2561   introduceCheckBlockInVPlan(TCCheckBlock);
2562 }
2563 
2564 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {
2565   BasicBlock *const SCEVCheckBlock =
2566       RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader);
2567   if (!SCEVCheckBlock)
2568     return nullptr;
2569 
2570   assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
2571            (OptForSizeBasedOnProfile &&
2572             Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
2573          "Cannot SCEV check stride or overflow when optimizing for size");
2574   assert(!LoopBypassBlocks.empty() &&
2575          "Should already be a bypass block due to iteration count check");
2576   LoopBypassBlocks.push_back(SCEVCheckBlock);
2577   AddedSafetyChecks = true;
2578 
2579   introduceCheckBlockInVPlan(SCEVCheckBlock);
2580   return SCEVCheckBlock;
2581 }
2582 
2583 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) {
2584   // VPlan-native path does not do any analysis for runtime checks currently.
2585   if (EnableVPlanNativePath)
2586     return nullptr;
2587 
2588   BasicBlock *const MemCheckBlock =
2589       RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader);
2590 
2591   // Check if we generated code that checks at runtime whether arrays overlap.
2592   // We put the checks into a separate block to make the more common case of
2593   // few elements faster.
2594   if (!MemCheckBlock)
2595     return nullptr;
2596 
2597   if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
2598     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
2599            "Cannot emit memory checks when optimizing for size, unless forced "
2600            "to vectorize.");
2601     ORE->emit([&]() {
2602       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
2603                                         OrigLoop->getStartLoc(),
2604                                         OrigLoop->getHeader())
2605              << "Code-size may be reduced by not forcing "
2606                 "vectorization, or by source-code modifications "
2607                 "eliminating the need for runtime checks "
2608                 "(e.g., adding 'restrict').";
2609     });
2610   }
2611 
2612   LoopBypassBlocks.push_back(MemCheckBlock);
2613 
2614   AddedSafetyChecks = true;
2615 
2616   introduceCheckBlockInVPlan(MemCheckBlock);
2617   return MemCheckBlock;
2618 }
2619 
2620 /// Replace \p VPBB with a VPIRBasicBlock wrapping \p IRBB. All recipes from \p
2621 /// VPBB are moved to the end of the newly created VPIRBasicBlock. VPBB must
2622 /// have a single predecessor, which is rewired to the new VPIRBasicBlock. All
2623 /// successors of VPBB, if any, are rewired to the new VPIRBasicBlock.
2624 static void replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB) {
2625   VPIRBasicBlock *IRVPBB = VPBB->getPlan()->createVPIRBasicBlock(IRBB);
2626   for (auto &R : make_early_inc_range(*VPBB)) {
2627     assert(!R.isPhi() && "Tried to move phi recipe to end of block");
2628     R.moveBefore(*IRVPBB, IRVPBB->end());
2629   }
2630 
2631   VPBlockUtils::reassociateBlocks(VPBB, IRVPBB);
2632   // VPBB is now dead and will be cleaned up when the plan gets destroyed.
2633 }
2634 
2635 void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
2636   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
2637   assert(LoopVectorPreHeader && "Invalid loop structure");
2638   assert((OrigLoop->getUniqueLatchExitBlock() ||
2639           Cost->requiresScalarEpilogue(VF.isVector())) &&
2640          "loops not exiting via the latch without required epilogue?");
2641 
2642   LoopMiddleBlock =
2643       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
2644                  LI, nullptr, Twine(Prefix) + "middle.block");
2645   replaceVPBBWithIRVPBB(Plan.getMiddleBlock(), LoopMiddleBlock);
2646   LoopScalarPreHeader =
2647       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
2648                  nullptr, Twine(Prefix) + "scalar.ph");
2649   replaceVPBBWithIRVPBB(Plan.getScalarPreheader(), LoopScalarPreHeader);
2650 }
2651 
2652 /// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
2653 /// expansion results.
2654 static Value *getExpandedStep(const InductionDescriptor &ID,
2655                               const SCEV2ValueTy &ExpandedSCEVs) {
2656   const SCEV *Step = ID.getStep();
2657   if (auto *C = dyn_cast<SCEVConstant>(Step))
2658     return C->getValue();
2659   if (auto *U = dyn_cast<SCEVUnknown>(Step))
2660     return U->getValue();
2661   auto I = ExpandedSCEVs.find(Step);
2662   assert(I != ExpandedSCEVs.end() && "SCEV must be expanded at this point");
2663   return I->second;
2664 }
2665 
2666 /// Knowing that loop \p L executes a single vector iteration, add instructions
2667 /// that will get simplified and thus should not have any cost to \p
2668 /// InstsToIgnore.
2669 static void addFullyUnrolledInstructionsToIgnore(
2670     Loop *L, const LoopVectorizationLegality::InductionList &IL,
2671     SmallPtrSetImpl<Instruction *> &InstsToIgnore) {
2672   auto *Cmp = L->getLatchCmpInst();
2673   if (Cmp)
2674     InstsToIgnore.insert(Cmp);
2675   for (const auto &KV : IL) {
2676     // Extract the key by hand so that it can be used in the lambda below.  Note
2677     // that captured structured bindings are a C++20 extension.
2678     const PHINode *IV = KV.first;
2679 
2680     // Get next iteration value of the induction variable.
2681     Instruction *IVInst =
2682         cast<Instruction>(IV->getIncomingValueForBlock(L->getLoopLatch()));
2683     if (all_of(IVInst->users(),
2684                [&](const User *U) { return U == IV || U == Cmp; }))
2685       InstsToIgnore.insert(IVInst);
2686   }
2687 }
2688 
2689 void InnerLoopVectorizer::createInductionAdditionalBypassValues(
2690     const SCEV2ValueTy &ExpandedSCEVs, Value *MainVectorTripCount) {
2691   assert(MainVectorTripCount && "Must have bypass information");
2692 
2693   Instruction *OldInduction = Legal->getPrimaryInduction();
2694   IRBuilder<> BypassBuilder(getAdditionalBypassBlock(),
2695                             getAdditionalBypassBlock()->getFirstInsertionPt());
2696   for (const auto &InductionEntry : Legal->getInductionVars()) {
2697     PHINode *OrigPhi = InductionEntry.first;
2698     const InductionDescriptor &II = InductionEntry.second;
2699     Value *Step = getExpandedStep(II, ExpandedSCEVs);
2700     // For the primary induction the additional bypass end value is known.
2701     // Otherwise it is computed.
2702     Value *EndValueFromAdditionalBypass = MainVectorTripCount;
2703     if (OrigPhi != OldInduction) {
2704       auto *BinOp = II.getInductionBinOp();
2705       // Fast-math-flags propagate from the original induction instruction.
2706       if (isa_and_nonnull<FPMathOperator>(BinOp))
2707         BypassBuilder.setFastMathFlags(BinOp->getFastMathFlags());
2708 
2709       // Compute the end value for the additional bypass.
2710       EndValueFromAdditionalBypass =
2711           emitTransformedIndex(BypassBuilder, MainVectorTripCount,
2712                                II.getStartValue(), Step, II.getKind(), BinOp);
2713       EndValueFromAdditionalBypass->setName("ind.end");
2714     }
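         // E.g. (illustrative): for a non-primary integer induction
         // {Start,+,Step}, the end value computed above is
         // Start + MainVectorTripCount * Step.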
2715 
2716     // Store the bypass value here, as it needs to be added as operand to its
2717     // scalar preheader phi node after the epilogue skeleton has been created.
2718     // TODO: Directly add as extra operand to the VPResumePHI recipe.
2719     assert(!Induction2AdditionalBypassValue.contains(OrigPhi) &&
2720            "entry for OrigPhi already exits");
2721     Induction2AdditionalBypassValue[OrigPhi] = EndValueFromAdditionalBypass;
2722   }
2723 }
2724 
2725 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton(
2726     const SCEV2ValueTy &ExpandedSCEVs) {
2727   /*
2728    In this function we generate a new loop. The new loop will contain
2729    the vectorized instructions while the old loop will continue to run the
2730    scalar remainder.
2731 
2732        [ ] <-- old preheader - loop iteration number check and SCEVs in Plan's
2733      /  |      preheader are expanded here. Eventually all required SCEV
2734     /   |      expansion should happen here.
2735    /    v
2736   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
2737   |  /  |
2738   | /   v
2739   ||   [ ]     <-- vector pre header.
2740   |/    |
2741   |     v
2742   |    [  ] \
2743   |    [  ]_|   <-- vector loop (created during VPlan execution).
2744   |     |
2745   |     v
2746   \   -[ ]   <--- middle-block (wrapped in VPIRBasicBlock with the branch to
2747    |    |                       successors created during VPlan execution)
2748    \/   |
2749    /\   v
2750    | ->[ ]     <--- new preheader (wrapped in VPIRBasicBlock).
2751    |    |
2752  (opt)  v      <-- edge from middle to exit iff epilogue is not required.
2753    |   [ ] \
2754    |   [ ]_|   <-- old scalar loop to handle remainder (scalar epilogue, header
2755    |    |          wrapped in VPIRBasicBlock).
2756     \   |
2757      \  v
2758       >[ ]     <-- exit block(s). (wrapped in VPIRBasicBlock)
2759    ...
2760    */
2761 
2762   // Create an empty vector loop, and prepare basic blocks for the runtime
2763   // checks.
2764   createVectorLoopSkeleton("");
2765 
2766   // Now, compare the new count to zero. If it is zero skip the vector loop and
2767   // jump to the scalar loop. This check also covers the case where the
2768   // backedge-taken count is uint##_max: adding one to it will overflow leading
2769   // to an incorrect trip count of zero. In this (rare) case we will also jump
2770   // to the scalar loop.
2771   emitIterationCountCheck(LoopScalarPreHeader);
2772 
2773   // Generate the code to check any assumptions that we've made for SCEV
2774   // expressions.
2775   emitSCEVChecks(LoopScalarPreHeader);
2776 
2777   // Generate the code that checks at runtime whether arrays overlap. We put
2778   // the checks into a separate block to make the more common case of few
2779   // elements faster.
2780   emitMemRuntimeChecks(LoopScalarPreHeader);
2781 
2782   return LoopVectorPreHeader;
2783 }
2784 
2785 // Fix up external users of the induction variable. At this point, we are
2786 // in LCSSA form, with all external PHIs that use the IV having one input value,
2787 // coming from the remainder loop. We need those PHIs to also have a correct
2788 // value for the IV when arriving directly from the middle block.
2789 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
2790                                        const InductionDescriptor &II,
2791                                        Value *VectorTripCount,
2792                                        BasicBlock *MiddleBlock,
2793                                        VPTransformState &State) {
2794   // There are two kinds of external IV usages - those that use the value
2795   // computed in the last iteration (the PHI) and those that use the penultimate
2796   // value (the value that feeds into the phi from the loop latch).
2797   // We allow both, but they, obviously, have different values.
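       // E.g. (illustrative): for %iv = phi [ 0, %ph ], [ %iv.next, %latch ],
       // an LCSSA phi outside the loop using %iv.next receives EndValue, while
       // one using %iv receives EndValue - Step (the penultimate value).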
2798 
2799   DenseMap<Value *, Value *> MissingVals;
2800 
2801   Value *EndValue = cast<PHINode>(OrigPhi->getIncomingValueForBlock(
2802                                       OrigLoop->getLoopPreheader()))
2803                         ->getIncomingValueForBlock(MiddleBlock);
2804 
2805   // An external user of the last iteration's value should see the value that
2806   // the remainder loop uses to initialize its own IV.
2807   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
2808   for (User *U : PostInc->users()) {
2809     Instruction *UI = cast<Instruction>(U);
2810     if (!OrigLoop->contains(UI)) {
2811       assert(isa<PHINode>(UI) && "Expected LCSSA form");
2812       MissingVals[UI] = EndValue;
2813     }
2814   }
2815 
2816   // An external user of the penultimate value needs to see EndValue - Step.
2817   // This is computed in the middle block by subtracting the step, expanded
2818   // during VPlan execution, from EndValue.
2819   for (User *U : OrigPhi->users()) {
2820     auto *UI = cast<Instruction>(U);
2821     if (!OrigLoop->contains(UI)) {
2822       assert(isa<PHINode>(UI) && "Expected LCSSA form");
2823       IRBuilder<> B(MiddleBlock->getTerminator());
2824 
2825       // Fast-math-flags propagate from the original induction instruction.
2826       if (isa_and_nonnull<FPMathOperator>(II.getInductionBinOp()))
2827         B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
2828 
2829       VPValue *StepVPV = Plan.getSCEVExpansion(II.getStep());
2830       assert(StepVPV && "step must have been expanded during VPlan execution");
2831       Value *Step = StepVPV->isLiveIn() ? StepVPV->getLiveInIRValue()
2832                                         : State.get(StepVPV, VPLane(0));
2833       Value *Escape = nullptr;
2834       if (EndValue->getType()->isIntegerTy())
2835         Escape = B.CreateSub(EndValue, Step);
2836       else if (EndValue->getType()->isPointerTy())
2837         Escape = B.CreatePtrAdd(EndValue, B.CreateNeg(Step));
2838       else {
2839         assert(EndValue->getType()->isFloatingPointTy() &&
2840                "Unexpected induction type");
2841         Escape = B.CreateBinOp(II.getInductionBinOp()->getOpcode() ==
2842                                        Instruction::FAdd
2843                                    ? Instruction::FSub
2844                                    : Instruction::FAdd,
2845                                EndValue, Step);
2846       }
2847       Escape->setName("ind.escape");
2848       MissingVals[UI] = Escape;
2849     }
2850   }
2851 
2852   assert((MissingVals.empty() ||
2853           all_of(MissingVals,
2854                  [MiddleBlock, this](const std::pair<Value *, Value *> &P) {
2855                    return all_of(
2856                        predecessors(cast<Instruction>(P.first)->getParent()),
2857                        [MiddleBlock, this](BasicBlock *Pred) {
2858                          return Pred == MiddleBlock ||
2859                                 Pred == OrigLoop->getLoopLatch();
2860                        });
2861                  })) &&
2862          "Expected escaping values from latch/middle.block only");
2863 
2864   for (auto &I : MissingVals) {
2865     PHINode *PHI = cast<PHINode>(I.first);
2866     // One corner case we have to handle is two IVs "chasing" each other,
2867     // that is %IV2 = phi [...], [ %IV1, %latch ]
2868     // In this case, if IV1 has an external use, we need to avoid adding both
2869     // "last value of IV1" and "penultimate value of IV2". So, verify that we
2870     // don't already have an incoming value for the middle block.
2871     if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
2872       PHI->addIncoming(I.second, MiddleBlock);
2873   }
2874 }
2875 
2876 namespace {
2877 
2878 struct CSEDenseMapInfo {
2879   static bool canHandle(const Instruction *I) {
2880     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
2881            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
2882   }
2883 
2884   static inline Instruction *getEmptyKey() {
2885     return DenseMapInfo<Instruction *>::getEmptyKey();
2886   }
2887 
2888   static inline Instruction *getTombstoneKey() {
2889     return DenseMapInfo<Instruction *>::getTombstoneKey();
2890   }
2891 
2892   static unsigned getHashValue(const Instruction *I) {
2893     assert(canHandle(I) && "Unknown instruction!");
2894     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
2895                                                            I->value_op_end()));
2896   }
2897 
2898   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
2899     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
2900         LHS == getTombstoneKey() || RHS == getTombstoneKey())
2901       return LHS == RHS;
2902     return LHS->isIdenticalTo(RHS);
2903   }
2904 };
2905 
2906 } // end anonymous namespace
2907 
2908 /// Perform CSE of induction variable instructions.
2909 static void cse(BasicBlock *BB) {
2910   // Perform simple cse.
2911   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
2912   for (Instruction &In : llvm::make_early_inc_range(*BB)) {
2913     if (!CSEDenseMapInfo::canHandle(&In))
2914       continue;
2915 
2916     // Check if we can replace this instruction with any of the
2917     // visited instructions.
2918     if (Instruction *V = CSEMap.lookup(&In)) {
2919       In.replaceAllUsesWith(V);
2920       In.eraseFromParent();
2921       continue;
2922     }
2923 
2924     CSEMap[&In] = &In;
2925   }
2926 }
2927 
2928 InstructionCost
2929 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
2930                                               ElementCount VF) const {
2931   // We only need to calculate a cost if the VF is scalar; for actual vectors
2932   // we should already have a pre-calculated cost at each VF.
2933   if (!VF.isScalar())
2934     return CallWideningDecisions.at(std::make_pair(CI, VF)).Cost;
2935 
2936   Type *RetTy = CI->getType();
2937   if (RecurrenceDescriptor::isFMulAddIntrinsic(CI))
2938     if (auto RedCost = getReductionPatternCost(CI, VF, RetTy))
2939       return *RedCost;
2940 
2941   SmallVector<Type *, 4> Tys;
2942   for (auto &ArgOp : CI->args())
2943     Tys.push_back(ArgOp->getType());
2944 
2945   InstructionCost ScalarCallCost =
2946       TTI.getCallInstrCost(CI->getCalledFunction(), RetTy, Tys, CostKind);
2947 
2948   // If this is an intrinsic we may have a lower cost for it.
2949   if (getVectorIntrinsicIDForCall(CI, TLI)) {
2950     InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
2951     return std::min(ScalarCallCost, IntrinsicCost);
2952   }
2953   return ScalarCallCost;
2954 }
2955 
2956 static Type *maybeVectorizeType(Type *Elt, ElementCount VF) {
2957   if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
2958     return Elt;
2959   return VectorType::get(Elt, VF);
2960 }
2961 
2962 InstructionCost
2963 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
2964                                                    ElementCount VF) const {
2965   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
2966   assert(ID && "Expected intrinsic call!");
2967   Type *RetTy = maybeVectorizeType(CI->getType(), VF);
2968   FastMathFlags FMF;
2969   if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
2970     FMF = FPMO->getFastMathFlags();
2971 
2972   SmallVector<const Value *> Arguments(CI->args());
2973   FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
2974   SmallVector<Type *> ParamTys;
2975   std::transform(FTy->param_begin(), FTy->param_end(),
2976                  std::back_inserter(ParamTys),
2977                  [&](Type *Ty) { return maybeVectorizeType(Ty, VF); });
2978 
2979   IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
2980                                     dyn_cast<IntrinsicInst>(CI));
2981   return TTI.getIntrinsicInstrCost(CostAttrs, CostKind);
2982 }
2983 
2984 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
2985   // Fix widened non-induction PHIs by setting up the PHI operands.
2986   if (EnableVPlanNativePath)
2987     fixNonInductionPHIs(State);
2988 
2989   // Forget the original basic block.
2990   PSE.getSE()->forgetLoop(OrigLoop);
2991   PSE.getSE()->forgetBlockAndLoopDispositions();
2992 
2993   // After vectorization, the exit blocks of the original loop will have
2994   // additional predecessors. Invalidate SCEVs for the exit phis in case SE
2995   // looked through single-entry phis.
2996   SmallVector<BasicBlock *> ExitBlocks;
2997   OrigLoop->getExitBlocks(ExitBlocks);
2998   for (BasicBlock *Exit : ExitBlocks)
2999     for (PHINode &PN : Exit->phis())
3000       PSE.getSE()->forgetLcssaPhiWithNewPredecessor(OrigLoop, &PN);
3001 
3002   if (Cost->requiresScalarEpilogue(VF.isVector())) {
3003     // No edge from the middle block to the unique exit block has been inserted
3004     // and there is nothing to fix from the vector loop; phis should have
3005     // incoming values from the scalar loop only.
3006   } else {
3007     // TODO: Check in VPlan to see if IV users need fixing instead of checking
3008     // the cost model.
3009 
3010     // If we inserted an edge from the middle block to the unique exit block,
3011     // update uses outside the loop (phis) to account for the newly inserted
3012     // edge.
3013 
3014     // Fix-up external users of the induction variables.
3015     for (const auto &Entry : Legal->getInductionVars())
3016       fixupIVUsers(Entry.first, Entry.second,
3017                    getOrCreateVectorTripCount(nullptr), LoopMiddleBlock, State);
3018   }
3019 
3020   // Don't apply optimizations below when no vector region remains, as they all
3021   // require a vector loop at the moment.
3022   if (!State.Plan->getVectorLoopRegion())
3023     return;
3024 
3025   for (Instruction *PI : PredicatedInstructions)
3026     sinkScalarOperands(&*PI);
3027 
3028   VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion();
3029   VPBasicBlock *HeaderVPBB = VectorRegion->getEntryBasicBlock();
3030   BasicBlock *HeaderBB = State.CFG.VPBB2IRBB[HeaderVPBB];
3031 
3032   // Remove redundant induction instructions.
3033   cse(HeaderBB);
3034 
3035   // Set/update profile weights for the vector and remainder loops as original
3036   // loop iterations are now distributed among them. Note that original loop
3037   // becomes the scalar remainder loop after vectorization.
3038   //
3039   // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
3040   // end up with a slightly less accurate result, but that should be OK since
3041   // the profile is not inherently precise anyway. Note also that a possible
3042   // bypass of the vector code caused by legality checks is ignored, assigning
3043   // all the weight to the vector loop, optimistically.
3044   //
3045   // For scalable vectorization we can't know at compile time how many
3046   // iterations of the loop are handled in one vector iteration, so instead
3047   // assume a pessimistic vscale of '1'.
3048   Loop *VectorLoop = LI->getLoopFor(HeaderBB);
3049   setProfileInfoAfterUnrolling(OrigLoop, VectorLoop, OrigLoop,
3050                                VF.getKnownMinValue() * UF);
3051 }
3052 
3053 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
3054   // The basic block and loop containing the predicated instruction.
3055   auto *PredBB = PredInst->getParent();
3056   auto *VectorLoop = LI->getLoopFor(PredBB);
3057 
3058   // Initialize a worklist with the operands of the predicated instruction.
3059   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3060 
3061   // Holds instructions that we need to analyze again. An instruction may be
3062   // reanalyzed if we don't yet know if we can sink it or not.
3063   SmallVector<Instruction *, 8> InstsToReanalyze;
3064 
3065   // Returns true if a given use occurs in the predicated block. Phi nodes use
3066   // their operands in their corresponding predecessor blocks.
3067   auto IsBlockOfUsePredicated = [&](Use &U) -> bool {
3068     auto *I = cast<Instruction>(U.getUser());
3069     BasicBlock *BB = I->getParent();
3070     if (auto *Phi = dyn_cast<PHINode>(I))
3071       BB = Phi->getIncomingBlock(
3072           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3073     return BB == PredBB;
3074   };
3075 
3076   // Iteratively sink the scalarized operands of the predicated instruction
3077   // into the block we created for it. When an instruction is sunk, its
3078   // operands are then added to the worklist. The algorithm ends once a pass
3079   // through the worklist does not sink a single instruction.
3080   bool Changed;
3081   do {
3082     // Add the instructions that need to be reanalyzed to the worklist, and
3083     // reset the changed indicator.
3084     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
3085     InstsToReanalyze.clear();
3086     Changed = false;
3087 
3088     while (!Worklist.empty()) {
3089       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
3090 
3091       // We can't sink an instruction if it is a phi node, is not in the loop,
3092       // may have side effects or may read from memory.
3093       // TODO: Could do more granular checking to allow sinking
3094       // a load past non-store instructions.
3095       if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
3096           I->mayHaveSideEffects() || I->mayReadFromMemory())
3097         continue;
3098 
3099       // If the instruction is already in PredBB, check if we can sink its
3100       // operands. In that case, VPlan's sinkScalarOperands() succeeded in
3101       // sinking the scalar instruction I, hence it appears in PredBB; but it
3102       // may have failed to sink I's operands (recursively), which we try
3103       // (again) here.
3104       if (I->getParent() == PredBB) {
3105         Worklist.insert(I->op_begin(), I->op_end());
3106         continue;
3107       }
3108 
3109       // It's legal to sink the instruction if all its uses occur in the
3110       // predicated block. Otherwise, there's nothing to do yet, and we may
3111       // need to reanalyze the instruction.
3112       if (!llvm::all_of(I->uses(), IsBlockOfUsePredicated)) {
3113         InstsToReanalyze.push_back(I);
3114         continue;
3115       }
3116 
3117       // Move the instruction to the beginning of the predicated block, and add
3118       // its operands to the worklist.
3119       I->moveBefore(&*PredBB->getFirstInsertionPt());
3120       Worklist.insert(I->op_begin(), I->op_end());
3121 
3122       // The sinking may have enabled other instructions to be sunk, so we will
3123       // need to iterate.
3124       Changed = true;
3125     }
3126   } while (Changed);
3127 }
3128 
3129 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) {
3130   auto Iter = vp_depth_first_deep(Plan.getEntry());
3131   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
3132     for (VPRecipeBase &P : VPBB->phis()) {
3133       VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P);
3134       if (!VPPhi)
3135         continue;
3136       PHINode *NewPhi = cast<PHINode>(State.get(VPPhi));
3137       // Make sure the builder has a valid insert point.
3138       Builder.SetInsertPoint(NewPhi);
3139       for (unsigned Idx = 0; Idx < VPPhi->getNumOperands(); ++Idx) {
3140         VPValue *Inc = VPPhi->getIncomingValue(Idx);
3141         VPBasicBlock *VPBB = VPPhi->getIncomingBlock(Idx);
3142         NewPhi->addIncoming(State.get(Inc), State.CFG.VPBB2IRBB[VPBB]);
3143       }
3144     }
3145   }
3146 }
3147 
3148 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
3149   // We should not collect Scalars more than once per VF. Right now, this
3150   // function is called from collectUniformsAndScalars(), which already does
3151   // this check. Collecting Scalars for VF=1 does not make any sense.
3152   assert(VF.isVector() && !Scalars.contains(VF) &&
3153          "This function should not be visited twice for the same VF");
3154 
3155   // This avoids any chances of creating a REPLICATE recipe during planning
3156   // since that would result in generation of scalarized code during execution,
3157   // which is not supported for scalable vectors.
3158   if (VF.isScalable()) {
3159     Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end());
3160     return;
3161   }
3162 
3163   SmallSetVector<Instruction *, 8> Worklist;
3164 
3165   // These sets are used to seed the analysis with pointers used by memory
3166   // accesses that will remain scalar.
3167   SmallSetVector<Instruction *, 8> ScalarPtrs;
3168   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
3169   auto *Latch = TheLoop->getLoopLatch();
3170 
3171   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
3172   // The pointer operands of loads and stores will be scalar as long as the
3173   // memory access is not a gather or scatter operation. The value operand of a
3174   // store will remain scalar if the store is scalarized.
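       // E.g. (illustrative): for a consecutive 'store i32 %v, ptr %gep', the
       // use of %gep is scalar (the widened store only needs the lane-0
       // address), while the use of %v is scalar only if the store itself is
       // scalarized.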
3175   auto IsScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
3176     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
3177     assert(WideningDecision != CM_Unknown &&
3178            "Widening decision should be ready at this moment");
3179     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
3180       if (Ptr == Store->getValueOperand())
3181         return WideningDecision == CM_Scalarize;
3182     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
3183            "Ptr is neither a value or pointer operand");
3184     return WideningDecision != CM_GatherScatter;
3185   };
3186 
3187   // A helper that returns true if the given value is a getelementptr
3188   // instruction contained in the loop.
3189   auto IsLoopVaryingGEP = [&](Value *V) {
3190     return isa<GetElementPtrInst>(V) && !TheLoop->isLoopInvariant(V);
3191   };
3192 
3193   // A helper that evaluates a memory access's use of a pointer. If the use will
3194   // be a scalar use and the pointer is only used by memory accesses, we place
3195   // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
3196   // PossibleNonScalarPtrs.
3197   auto EvaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
3198     // We only care about getelementptr instructions contained in the loop
3199     // (see IsLoopVaryingGEP above).
3200     if (!IsLoopVaryingGEP(Ptr))
3201       return;
3202 
3203     // If the pointer has already been identified as scalar (e.g., if it was
3204     // also identified as uniform), there's nothing to do.
3205     auto *I = cast<Instruction>(Ptr);
3206     if (Worklist.count(I))
3207       return;
3208 
3209     // If the use of the pointer will be a scalar use, and all users of the
3210     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
3211     // place the pointer in PossibleNonScalarPtrs.
3212     if (IsScalarUse(MemAccess, Ptr) &&
3213         all_of(I->users(), IsaPred<LoadInst, StoreInst>))
3214       ScalarPtrs.insert(I);
3215     else
3216       PossibleNonScalarPtrs.insert(I);
3217   };
3218 
3219   // We seed the scalars analysis with three classes of instructions: (1)
3220   // instructions marked uniform-after-vectorization, (2) bitcast,
3221   // getelementptr and (pointer) phi instructions used by memory accesses
3222   // requiring a scalar use, and (3) instructions forced to be scalar.
3223   //
3224   // (1) Add to the worklist all instructions that have been identified as
3225   // uniform-after-vectorization.
3226   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
3227 
3228   // (2) Add to the worklist all bitcast and getelementptr instructions used by
3229   // memory accesses requiring a scalar use. The pointer operands of loads and
3230   // stores will be scalar unless the operation is a gather or scatter.
3231   // The value operand of a store will remain scalar if the store is scalarized.
3232   for (auto *BB : TheLoop->blocks())
3233     for (auto &I : *BB) {
3234       if (auto *Load = dyn_cast<LoadInst>(&I)) {
3235         EvaluatePtrUse(Load, Load->getPointerOperand());
3236       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
3237         EvaluatePtrUse(Store, Store->getPointerOperand());
3238         EvaluatePtrUse(Store, Store->getValueOperand());
3239       }
3240     }
3241   for (auto *I : ScalarPtrs)
3242     if (!PossibleNonScalarPtrs.count(I)) {
3243       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
3244       Worklist.insert(I);
3245     }
3246 
3247   // Insert the forced scalars.
3248   // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector
3249   // induction variable when the PHI user is scalarized.
3250   auto ForcedScalar = ForcedScalars.find(VF);
3251   if (ForcedScalar != ForcedScalars.end())
3252     for (auto *I : ForcedScalar->second) {
3253       LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n");
3254       Worklist.insert(I);
3255     }
3256 
3257   // Expand the worklist by looking through any bitcasts and getelementptr
3258   // instructions we've already identified as scalar. This is similar to the
3259   // expansion step in collectLoopUniforms(); however, here we're only
3260   // expanding to include additional bitcasts and getelementptr instructions.
3261   unsigned Idx = 0;
3262   while (Idx != Worklist.size()) {
3263     Instruction *Dst = Worklist[Idx++];
3264     if (!IsLoopVaryingGEP(Dst->getOperand(0)))
3265       continue;
3266     auto *Src = cast<Instruction>(Dst->getOperand(0));
3267     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
3268           auto *J = cast<Instruction>(U);
3269           return !TheLoop->contains(J) || Worklist.count(J) ||
3270                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
3271                   IsScalarUse(J, Src));
3272         })) {
3273       Worklist.insert(Src);
3274       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
3275     }
3276   }
3277 
3278   // An induction variable will remain scalar if all users of the induction
3279   // variable and induction variable update remain scalar.
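       // E.g. (illustrative): if %i is used only by its update
       // '%i.next = add %i, 1' and by GEPs that remain scalar, both %i and
       // %i.next stay scalar; any widened user forces a vector induction to be
       // kept as well.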
3280   for (const auto &Induction : Legal->getInductionVars()) {
3281     auto *Ind = Induction.first;
3282     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
3283 
3284     // If tail-folding is applied, the primary induction variable will be used
3285     // to feed a vector compare.
3286     if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
3287       continue;
3288 
3289     // Returns true if \p Indvar is a pointer induction that is used directly by
3290     // load/store instruction \p I.
3291     auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
3292                                               Instruction *I) {
3293       return Induction.second.getKind() ==
3294                  InductionDescriptor::IK_PtrInduction &&
3295              (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
3296              Indvar == getLoadStorePointerOperand(I) && IsScalarUse(I, Indvar);
3297     };
3298 
3299     // Determine if all users of the induction variable are scalar after
3300     // vectorization.
3301     bool ScalarInd = all_of(Ind->users(), [&](User *U) -> bool {
3302       auto *I = cast<Instruction>(U);
3303       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
3304              IsDirectLoadStoreFromPtrIndvar(Ind, I);
3305     });
3306     if (!ScalarInd)
3307       continue;
3308 
3309     // If the induction variable update is a fixed-order recurrence, neither the
3310     // induction variable nor its update should be marked scalar after
3311     // vectorization.
3312     auto *IndUpdatePhi = dyn_cast<PHINode>(IndUpdate);
3313     if (IndUpdatePhi && Legal->isFixedOrderRecurrence(IndUpdatePhi))
3314       continue;
3315 
3316     // Determine if all users of the induction variable update instruction are
3317     // scalar after vectorization.
3318     bool ScalarIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool {
3319       auto *I = cast<Instruction>(U);
3320       return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
3321              IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
3322     });
3323     if (!ScalarIndUpdate)
3324       continue;
3325 
3326     // The induction variable and its update instruction will remain scalar.
3327     Worklist.insert(Ind);
3328     Worklist.insert(IndUpdate);
3329     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
3330     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
3331                       << "\n");
3332   }
3333 
3334   Scalars[VF].insert(Worklist.begin(), Worklist.end());
3335 }
3336 
3337 bool LoopVectorizationCostModel::isScalarWithPredication(
3338     Instruction *I, ElementCount VF) const {
3339   if (!isPredicatedInst(I))
3340     return false;
3341 
3342   // Do we have a non-scalar lowering for this predicated
3343   // instruction? If not, it is scalar with predication.
3344   switch (I->getOpcode()) {
3345   default:
3346     return true;
3347   case Instruction::Call:
3348     if (VF.isScalar())
3349       return true;
3350     return CallWideningDecisions.at(std::make_pair(cast<CallInst>(I), VF))
3351                .Kind == CM_Scalarize;
3352   case Instruction::Load:
3353   case Instruction::Store: {
3354     auto *Ptr = getLoadStorePointerOperand(I);
3355     auto *Ty = getLoadStoreType(I);
3356     Type *VTy = Ty;
3357     if (VF.isVector())
3358       VTy = VectorType::get(Ty, VF);
3359     const Align Alignment = getLoadStoreAlignment(I);
3360     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
3361                                 TTI.isLegalMaskedGather(VTy, Alignment))
3362                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
3363                                 TTI.isLegalMaskedScatter(VTy, Alignment));
3364   }
3365   case Instruction::UDiv:
3366   case Instruction::SDiv:
3367   case Instruction::SRem:
3368   case Instruction::URem: {
3369     // We have the option to use the safe-divisor idiom to avoid predication.
3370     // The cost based decision here will always select safe-divisor for
3371     // scalable vectors as scalarization isn't legal.
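         // E.g. (illustrative): rather than predicating 'udiv %x, %y', the
         // divisor can be replaced by 'select %mask, %y, 1' so that inactive
         // lanes divide by a safe constant.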
3372     const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
3373     return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost);
3374   }
3375   }
3376 }
3377 
3378 // TODO: Fold into LoopVectorizationLegality::isMaskRequired.
3379 bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
3380   // If predication is not needed, avoid it.
3381   // TODO: We can use the loop-preheader as context point here and get
3382   // context sensitive reasoning for isSafeToSpeculativelyExecute.
3383   if (!blockNeedsPredicationForAnyReason(I->getParent()) ||
3384       isSafeToSpeculativelyExecute(I) ||
3385       (isa<LoadInst, StoreInst, CallInst>(I) && !Legal->isMaskRequired(I)) ||
3386       isa<BranchInst, SwitchInst, PHINode, AllocaInst>(I))
3387     return false;
3388 
3389   // If the instruction was executed conditionally in the original scalar loop,
3390   // predication is needed with a mask whose lanes are all possibly inactive.
3391   if (Legal->blockNeedsPredication(I->getParent()))
3392     return true;
3393 
3394   // All that remain are instructions with side-effects originally executed in
3395   // the loop unconditionally, but now execute under a tail-fold mask (only)
3396   // having at least one active lane (the first). If the side-effects of the
3397   // instruction are invariant, executing it w/o (the tail-folding) mask is safe
3398   // - it will cause the same side-effects as when masked.
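       // E.g. (illustrative): a store of a loop-invariant value to a
       // loop-invariant address, executed unconditionally in the scalar loop,
       // need not be masked under tail folding: the guaranteed-active first
       // lane produces the same side-effect as the masked store.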
3399   switch (I->getOpcode()) {
3400   default:
3401     llvm_unreachable(
3402         "instruction should have been considered by earlier checks");
3403   case Instruction::Call:
3404     // Side-effects of a Call are assumed to be non-invariant, needing a
3405     // (fold-tail) mask.
3406     assert(Legal->isMaskRequired(I) &&
3407            "should have returned earlier for calls not needing a mask");
3408     return true;
3409   case Instruction::Load:
3410     // If the address is loop invariant no predication is needed.
3411     return !Legal->isInvariant(getLoadStorePointerOperand(I));
3412   case Instruction::Store: {
3413     // For stores, we need to prove both speculation safety (which follows from
3414     // the same argument as loads) and that the value being stored is correct.
3415     // The easiest form of the latter is to require that all values stored are
3416     // the same.
3417     return !(Legal->isInvariant(getLoadStorePointerOperand(I)) &&
3418              TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()));
3419   }
3420   case Instruction::UDiv:
3421   case Instruction::SDiv:
3422   case Instruction::SRem:
3423   case Instruction::URem:
3424     // If the divisor is loop-invariant no predication is needed.
3425     return !TheLoop->isLoopInvariant(I->getOperand(1));
3426   }
3427 }
3428 
3429 std::pair<InstructionCost, InstructionCost>
3430 LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
3431                                                     ElementCount VF) const {
3432   assert(I->getOpcode() == Instruction::UDiv ||
3433          I->getOpcode() == Instruction::SDiv ||
3434          I->getOpcode() == Instruction::SRem ||
3435          I->getOpcode() == Instruction::URem);
3436   assert(!isSafeToSpeculativelyExecute(I));
3437 
3438   // Scalarization isn't legal for scalable vector types
3439   InstructionCost ScalarizationCost = InstructionCost::getInvalid();
3440   if (!VF.isScalable()) {
3441     // Get the scalarization cost and scale this amount by the probability of
3442     // executing the predicated block.
3444     ScalarizationCost = 0;
3445 
3446     // These instructions have a non-void type, so account for the phi nodes
3447     // that we will create. This cost is likely to be zero. The phi node
3448     // cost, if any, should be scaled by the block probability because it
3449     // models a copy at the end of each predicated block.
3450     ScalarizationCost += VF.getKnownMinValue() *
3451       TTI.getCFInstrCost(Instruction::PHI, CostKind);
3452 
3453     // The cost of the non-predicated instruction.
3454     ScalarizationCost += VF.getKnownMinValue() *
3455       TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind);
3456 
3457     // The cost of insertelement and extractelement instructions needed for
3458     // scalarization.
3459     ScalarizationCost += getScalarizationOverhead(I, VF);
3460 
3461     // Scale the cost by the probability of executing the predicated blocks.
3462     // This assumes the predicated block for each vector lane is equally
3463     // likely.
3464     ScalarizationCost = ScalarizationCost / getReciprocalPredBlockProb();
3465   }
3466   InstructionCost SafeDivisorCost = 0;
3467 
3468   auto *VecTy = toVectorTy(I->getType(), VF);
3469 
3470   // The cost of the select guard to ensure all lanes are well defined
3471   // after we speculate above any internal control flow.
3472   SafeDivisorCost +=
3473       TTI.getCmpSelInstrCost(Instruction::Select, VecTy,
3474                              toVectorTy(Type::getInt1Ty(I->getContext()), VF),
3475                              CmpInst::BAD_ICMP_PREDICATE, CostKind);
3476 
3477   // Certain instructions can be cheaper to vectorize if they have a constant
3478   // second vector operand. One example of this are shifts on x86.
3479   Value *Op2 = I->getOperand(1);
3480   auto Op2Info = TTI.getOperandInfo(Op2);
3481   if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
3482       Legal->isInvariant(Op2))
3483     Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
3484 
3485   SmallVector<const Value *, 4> Operands(I->operand_values());
3486   SafeDivisorCost += TTI.getArithmeticInstrCost(
3487     I->getOpcode(), VecTy, CostKind,
3488     {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
3489     Op2Info, Operands, I);
3490   return {ScalarizationCost, SafeDivisorCost};
3491 }
3492 
3493 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
3494     Instruction *I, ElementCount VF) const {
3495   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
3496   assert(getWideningDecision(I, VF) == CM_Unknown &&
3497          "Decision should not be set yet.");
3498   auto *Group = getInterleavedAccessGroup(I);
3499   assert(Group && "Must have a group.");
3500   unsigned InterleaveFactor = Group->getFactor();
3501 
3502   // If the instruction's allocated size doesn't equal its type size, it
3503   // requires padding and will be scalarized.
3504   auto &DL = I->getDataLayout();
3505   auto *ScalarTy = getLoadStoreType(I);
3506   if (hasIrregularType(ScalarTy, DL))
3507     return false;
3508 
3509   // For scalable vectors, the only interleave factors currently supported
3510   // are powers of 2, since we rely on the (de)interleave2 intrinsics
3511   // instead of shufflevectors.
3512   if (VF.isScalable() && !isPowerOf2_32(InterleaveFactor))
3513     return false;
3514 
3515   // If the group involves a non-integral pointer, we may not be able to
3516   // losslessly cast all values to a common type.
3517   bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy);
3518   for (unsigned Idx = 0; Idx < InterleaveFactor; Idx++) {
3519     Instruction *Member = Group->getMember(Idx);
3520     if (!Member)
3521       continue;
3522     auto *MemberTy = getLoadStoreType(Member);
3523     bool MemberNI = DL.isNonIntegralPointerType(MemberTy);
3524     // Don't coerce non-integral pointers to integers or vice versa.
3525     if (MemberNI != ScalarNI)
3526       // TODO: Consider adding special nullptr value case here
3527       return false;
3528     if (MemberNI && ScalarNI &&
3529         ScalarTy->getPointerAddressSpace() !=
3530             MemberTy->getPointerAddressSpace())
3531       return false;
3532   }
3533 
3534   // Check if masking is required.
3535   // A Group may need masking for one of two reasons: it resides in a block that
3536   // needs predication, or it was decided to use masking to deal with gaps
3537   // (either a gap at the end of a load-access that may result in a speculative
3538   // load, or any gaps in a store-access).
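       // E.g. (illustrative): a load-only group accessing A[3*i] and A[3*i+1]
       // but not A[3*i+2] has a gap; widening it may speculatively read past
       // the last element unless the group is masked or a scalar epilogue runs.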
3539   bool PredicatedAccessRequiresMasking =
3540       blockNeedsPredicationForAnyReason(I->getParent()) &&
3541       Legal->isMaskRequired(I);
3542   bool LoadAccessWithGapsRequiresEpilogMasking =
3543       isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
3544       !isScalarEpilogueAllowed();
3545   bool StoreAccessWithGapsRequiresMasking =
3546       isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
3547   if (!PredicatedAccessRequiresMasking &&
3548       !LoadAccessWithGapsRequiresEpilogMasking &&
3549       !StoreAccessWithGapsRequiresMasking)
3550     return true;
3551 
3552   // If masked interleaving is required, we expect that the user/target had
3553   // enabled it, because otherwise it either wouldn't have been created or
3554   // it should have been invalidated by the CostModel.
3555   assert(useMaskedInterleavedAccesses(TTI) &&
3556          "Masked interleave-groups for predicated accesses are not enabled.");
3557 
3558   if (Group->isReverse())
3559     return false;
3560 
3561   auto *Ty = getLoadStoreType(I);
3562   const Align Alignment = getLoadStoreAlignment(I);
3563   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
3564                           : TTI.isLegalMaskedStore(Ty, Alignment);
3565 }
3566 
3567 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
3568     Instruction *I, ElementCount VF) {
3569   // Get and ensure we have a valid memory instruction.
3570   assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
3571 
3572   auto *Ptr = getLoadStorePointerOperand(I);
3573   auto *ScalarTy = getLoadStoreType(I);
3574 
3575   // First of all, in order to be widened the pointer must be consecutive.
3576   if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
3577     return false;
3578 
3579   // If the instruction is a store located in a predicated block, it will be
3580   // scalarized.
3581   if (isScalarWithPredication(I, VF))
3582     return false;
3583 
3584   // If the instruction's allocated size doesn't equal its type size, it
3585   // requires padding and will be scalarized.
3586   auto &DL = I->getDataLayout();
3587   if (hasIrregularType(ScalarTy, DL))
3588     return false;
3589 
3590   return true;
3591 }
3592 
3593 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
3594   // We should not collect Uniforms more than once per VF. Right now,
3595   // this function is called from collectUniformsAndScalars(), which
3596   // already does this check. Collecting Uniforms for VF=1 does not make any
3597   // sense.
3598 
3599   assert(VF.isVector() && !Uniforms.contains(VF) &&
3600          "This function should not be visited twice for the same VF");
3601 
3602   // Initialize the entry for this VF. Even if we find no uniform values, we
3603   // won't analyze it again: Uniforms.count(VF) will return 1.
3604   Uniforms[VF].clear();
3605 
3606   // Now we know that the loop is vectorizable!
3607   // Collect instructions inside the loop that will remain uniform after
3608   // vectorization.
3609 
3610   // Global values, params and instructions outside of the current loop are out
3611   // of scope.
3612   auto IsOutOfScope = [&](Value *V) -> bool {
3613     Instruction *I = dyn_cast<Instruction>(V);
3614     return (!I || !TheLoop->contains(I));
3615   };
3616 
3617   // Worklist containing uniform instructions demanding lane 0.
3618   SetVector<Instruction *> Worklist;
3619 
3620   // Add uniform instructions demanding lane 0 to the worklist. Instructions
3621   // that require predication must not be considered uniform after
3622   // vectorization, because that would create an erroneous replicating region
3623   // where only a single instance out of VF should be formed.
3624   auto AddToWorklistIfAllowed = [&](Instruction *I) -> void {
3625     if (IsOutOfScope(I)) {
3626       LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
3627                         << *I << "\n");
3628       return;
3629     }
3630     if (isPredicatedInst(I)) {
3631       LLVM_DEBUG(
3632           dbgs() << "LV: Found not uniform due to requiring predication: " << *I
3633                  << "\n");
3634       return;
3635     }
3636     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
3637     Worklist.insert(I);
3638   };
3639 
3640   // Start with the conditional branches exiting the loop. If the branch
3641   // condition is an instruction contained in the loop that is only used by the
3642   // branch, it is uniform. Note conditions from uncountable early exits are not
3643   // uniform.
3644   SmallVector<BasicBlock *> Exiting;
3645   TheLoop->getExitingBlocks(Exiting);
3646   for (BasicBlock *E : Exiting) {
3647     if (Legal->hasUncountableEarlyExit() && TheLoop->getLoopLatch() != E)
3648       continue;
3649     auto *Cmp = dyn_cast<Instruction>(E->getTerminator()->getOperand(0));
3650     if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
3651       AddToWorklistIfAllowed(Cmp);
3652   }
3653 
3654   auto PrevVF = VF.divideCoefficientBy(2);
3655   // Return true if all lanes perform the same memory operation, and we can
3656   // thus choose to execute only one.
3657   auto IsUniformMemOpUse = [&](Instruction *I) {
3658     // If the value was already known to not be uniform for the previous
3659     // (smaller VF), it cannot be uniform for the larger VF.
3660     if (PrevVF.isVector()) {
3661       auto Iter = Uniforms.find(PrevVF);
3662       if (Iter != Uniforms.end() && !Iter->second.contains(I))
3663         return false;
3664     }
3665     if (!Legal->isUniformMemOp(*I, VF))
3666       return false;
3667     if (isa<LoadInst>(I))
3668       // Loading the same address always produces the same result - at least
3669       // under the aliasing and ordering constraints already checked.
3670       return true;
3671     // Storing the same value on every iteration.
3672     return TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand());
3673   };
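  // For illustration (hypothetical loop): in
  //   for (i) { sum += A[42]; B[42] = x; }
  // the load of A[42] and the store of the loop-invariant x to B[42] are
  // uniform memory ops; every lane would perform the identical access, so a
  // single scalar access per vector iteration suffices.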
3674 
3675   auto IsUniformDecision = [&](Instruction *I, ElementCount VF) {
3676     InstWidening WideningDecision = getWideningDecision(I, VF);
3677     assert(WideningDecision != CM_Unknown &&
3678            "Widening decision should be ready at this moment");
3679 
3680     if (IsUniformMemOpUse(I))
3681       return true;
3682 
3683     return (WideningDecision == CM_Widen ||
3684             WideningDecision == CM_Widen_Reverse ||
3685             WideningDecision == CM_Interleave);
3686   };
3687 
3688   // Returns true if Ptr is the pointer operand of a memory access instruction
3689   // I, I is known to not require scalarization, and the pointer is not also
3690   // stored.
3691   auto IsVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
3692     if (isa<StoreInst>(I) && I->getOperand(0) == Ptr)
3693       return false;
3694     return getLoadStorePointerOperand(I) == Ptr &&
3695            (IsUniformDecision(I, VF) || Legal->isInvariant(Ptr));
3696   };
3697 
3698   // Holds a list of values which are known to have at least one uniform use.
3699   // Note that there may be other uses which aren't uniform.  A "uniform use"
3700   // here is something which only demands lane 0 of the unrolled iterations;
3701   // it does not imply that all lanes produce the same value (i.e. this is not
3702   // the usual meaning of uniform).
3703   SetVector<Value *> HasUniformUse;
3704 
3705   // Scan the loop for instructions which are either a) known to have only
3706   // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
3707   for (auto *BB : TheLoop->blocks())
3708     for (auto &I : *BB) {
3709       if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
3710         switch (II->getIntrinsicID()) {
3711         case Intrinsic::sideeffect:
3712         case Intrinsic::experimental_noalias_scope_decl:
3713         case Intrinsic::assume:
3714         case Intrinsic::lifetime_start:
3715         case Intrinsic::lifetime_end:
3716           if (TheLoop->hasLoopInvariantOperands(&I))
3717             AddToWorklistIfAllowed(&I);
3718           break;
3719         default:
3720           break;
3721         }
3722       }
3723 
3724       // ExtractValue instructions must be uniform, because the operands are
3725       // known to be loop-invariant.
3726       if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
3727         assert(IsOutOfScope(EVI->getAggregateOperand()) &&
3728                "Expected aggregate value to be loop invariant");
3729         AddToWorklistIfAllowed(EVI);
3730         continue;
3731       }
3732 
3733       // If there's no pointer operand, there's nothing to do.
3734       auto *Ptr = getLoadStorePointerOperand(&I);
3735       if (!Ptr)
3736         continue;
3737 
3738       if (IsUniformMemOpUse(&I))
3739         AddToWorklistIfAllowed(&I);
3740 
3741       if (IsVectorizedMemAccessUse(&I, Ptr))
3742         HasUniformUse.insert(Ptr);
3743     }
3744 
3745   // Add to the worklist any operands which have *only* uniform (e.g. lane 0
3746   // demanding) users.  Since loops are assumed to be in LCSSA form, this
3747   // disallows uses outside the loop as well.
3748   for (auto *V : HasUniformUse) {
3749     if (IsOutOfScope(V))
3750       continue;
3751     auto *I = cast<Instruction>(V);
3752     bool UsersAreMemAccesses = all_of(I->users(), [&](User *U) -> bool {
3753       auto *UI = cast<Instruction>(U);
3754       return TheLoop->contains(UI) && IsVectorizedMemAccessUse(UI, V);
3755     });
3756     if (UsersAreMemAccesses)
3757       AddToWorklistIfAllowed(I);
3758   }
3759 
3760   // Expand Worklist in topological order: whenever a new instruction
3761   // is added, its users should already be inside Worklist. This ensures that
3762   // a uniform instruction will only be used by uniform instructions.
3763   unsigned Idx = 0;
3764   while (Idx != Worklist.size()) {
3765     Instruction *I = Worklist[Idx++];
3766 
3767     for (auto *OV : I->operand_values()) {
3768       // Out-of-scope operands cannot be uniform instructions.
3769       if (IsOutOfScope(OV))
3770         continue;
3771       // First-order recurrence phis should typically be considered
3772       // non-uniform.
3773       auto *OP = dyn_cast<PHINode>(OV);
3774       if (OP && Legal->isFixedOrderRecurrence(OP))
3775         continue;
3776       // If all the users of the operand are uniform, then add the
3777       // operand into the uniform worklist.
3778       auto *OI = cast<Instruction>(OV);
3779       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
3780             auto *J = cast<Instruction>(U);
3781             return Worklist.count(J) || IsVectorizedMemAccessUse(J, OI);
3782           }))
3783         AddToWorklistIfAllowed(OI);
3784     }
3785   }
3786 
3787   // For an instruction to be added into Worklist above, all its users inside
3788   // the loop should also be in Worklist. However, this condition cannot be
3789   // true for phi nodes that form a cyclic dependence. We must process phi
3790   // nodes separately. An induction variable will remain uniform if all users
3791   // of the induction variable and induction variable update remain uniform.
3792   // The code below handles both pointer and non-pointer induction variables.
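  // For illustration (hypothetical loop): in
  //   for (i = 0; i < n; ++i) A[i] = 0;
  // i is used only by the consecutive store's address computation and by its
  // own increment, whose remaining user is the latch compare; all of these
  // demand only lane 0, so the induction and its update remain uniform.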
3793   BasicBlock *Latch = TheLoop->getLoopLatch();
3794   for (const auto &Induction : Legal->getInductionVars()) {
3795     auto *Ind = Induction.first;
3796     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
3797 
3798     // Determine if all users of the induction variable are uniform after
3799     // vectorization.
3800     bool UniformInd = all_of(Ind->users(), [&](User *U) -> bool {
3801       auto *I = cast<Instruction>(U);
3802       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
3803              IsVectorizedMemAccessUse(I, Ind);
3804     });
3805     if (!UniformInd)
3806       continue;
3807 
3808     // Determine if all users of the induction variable update instruction are
3809     // uniform after vectorization.
3810     bool UniformIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool {
3811       auto *I = cast<Instruction>(U);
3812       return I == Ind || Worklist.count(I) ||
3813              IsVectorizedMemAccessUse(I, IndUpdate);
3814     });
3815     if (!UniformIndUpdate)
3816       continue;
3817 
3818     // The induction variable and its update instruction will remain uniform.
3819     AddToWorklistIfAllowed(Ind);
3820     AddToWorklistIfAllowed(IndUpdate);
3821   }
3822 
3823   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
3824 }
3825 
3826 bool LoopVectorizationCostModel::runtimeChecksRequired() {
3827   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
3828 
3829   if (Legal->getRuntimePointerChecking()->Need) {
3830     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
3831         "runtime pointer checks needed. Enable vectorization of this "
3832         "loop with '#pragma clang loop vectorize(enable)' when "
3833         "compiling with -Os/-Oz",
3834         "CantVersionLoopWithOptForSize", ORE, TheLoop);
3835     return true;
3836   }
3837 
3838   if (!PSE.getPredicate().isAlwaysTrue()) {
3839     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
3840         "runtime SCEV checks needed. Enable vectorization of this "
3841         "loop with '#pragma clang loop vectorize(enable)' when "
3842         "compiling with -Os/-Oz",
3843         "CantVersionLoopWithOptForSize", ORE, TheLoop);
3844     return true;
3845   }
3846 
3847   // FIXME: Avoid specializing for stride==1 instead of bailing out.
3848   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
3849     reportVectorizationFailure("Runtime stride check for small trip count",
3850         "runtime stride == 1 checks needed. Enable vectorization of "
3851         "this loop without such check by compiling with -Os/-Oz",
3852         "CantVersionLoopWithOptForSize", ORE, TheLoop);
3853     return true;
3854   }
3855 
3856   return false;
3857 }
3858 
3859 bool LoopVectorizationCostModel::isScalableVectorizationAllowed() {
3860   if (IsScalableVectorizationAllowed)
3861     return *IsScalableVectorizationAllowed;
3862 
3863   IsScalableVectorizationAllowed = false;
3864   if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
3865     return false;
3866 
3867   if (Hints->isScalableVectorizationDisabled()) {
3868     reportVectorizationInfo("Scalable vectorization is explicitly disabled",
3869                             "ScalableVectorizationDisabled", ORE, TheLoop);
3870     return false;
3871   }
3872 
3873   LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
3874 
3875   auto MaxScalableVF = ElementCount::getScalable(
3876       std::numeric_limits<ElementCount::ScalarTy>::max());
3877 
3878   // Test that the loop-vectorizer can legalize all operations for this MaxVF.
3879   // FIXME: While for scalable vectors this is currently sufficient, this should
3880   // be replaced by a more detailed mechanism that filters out specific VFs,
3881   // instead of invalidating vectorization for a whole set of VFs based on the
3882   // MaxVF.
3883 
3884   // Disable scalable vectorization if the loop contains unsupported reductions.
3885   if (!canVectorizeReductions(MaxScalableVF)) {
3886     reportVectorizationInfo(
3887         "Scalable vectorization not supported for the reduction "
3888         "operations found in this loop.",
3889         "ScalableVFUnfeasible", ORE, TheLoop);
3890     return false;
3891   }
3892 
3893   // Disable scalable vectorization if the loop contains any instructions
3894   // with element types not supported for scalable vectors.
3895   if (any_of(ElementTypesInLoop, [&](Type *Ty) {
3896         return !Ty->isVoidTy() &&
3897                !this->TTI.isElementTypeLegalForScalableVector(Ty);
3898       })) {
3899     reportVectorizationInfo("Scalable vectorization is not supported "
3900                             "for all element types found in this loop.",
3901                             "ScalableVFUnfeasible", ORE, TheLoop);
3902     return false;
3903   }
3904 
3905   if (!Legal->isSafeForAnyVectorWidth() && !getMaxVScale(*TheFunction, TTI)) {
3906     reportVectorizationInfo("The target does not provide maximum vscale value "
3907                             "for safe distance analysis.",
3908                             "ScalableVFUnfeasible", ORE, TheLoop);
3909     return false;
3910   }
3911 
3912   IsScalableVectorizationAllowed = true;
3913   return true;
3914 }
3915 
3916 ElementCount
3917 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
3918   if (!isScalableVectorizationAllowed())
3919     return ElementCount::getScalable(0);
3920 
3921   auto MaxScalableVF = ElementCount::getScalable(
3922       std::numeric_limits<ElementCount::ScalarTy>::max());
3923   if (Legal->isSafeForAnyVectorWidth())
3924     return MaxScalableVF;
3925 
3926   std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
3927   // Limit MaxScalableVF by the maximum safe dependence distance.
3928   MaxScalableVF = ElementCount::getScalable(MaxSafeElements / *MaxVScale);
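  // For example (hypothetical values): with MaxSafeElements == 32 and a
  // target MaxVScale of 16, this yields vscale x 2, i.e. at most 32 lanes
  // even when vscale reaches its maximum.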
3929 
3930   if (!MaxScalableVF)
3931     reportVectorizationInfo(
3932         "Max legal vector width too small, scalable vectorization "
3933         "unfeasible.",
3934         "ScalableVFUnfeasible", ORE, TheLoop);
3935 
3936   return MaxScalableVF;
3937 }
3938 
3939 FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
3940     unsigned MaxTripCount, ElementCount UserVF, bool FoldTailByMasking) {
3941   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
3942   unsigned SmallestType, WidestType;
3943   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
3944 
3945   // Get the maximum safe dependence distance in bits computed by LAA.
3946   // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
3947   // the memory access that is most restrictive (involved in the smallest
3948   // dependence distance).
3949   unsigned MaxSafeElements =
3950       llvm::bit_floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
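  // For example (hypothetical values): a max safe vector width of 256 bits
  // with a widest type of 32 bits gives MaxSafeElements == 8.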
3951 
3952   auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
3953   auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
3954   if (!Legal->isSafeForAnyVectorWidth())
3955     this->MaxSafeElements = MaxSafeElements;
3956 
3957   LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
3958                     << ".\n");
3959   LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
3960                     << ".\n");
3961 
3962   // First analyze the UserVF, fall back if the UserVF should be ignored.
3963   if (UserVF) {
3964     auto MaxSafeUserVF =
3965         UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
3966 
3967     if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
3968       // If `VF=vscale x N` is safe, then so is `VF=N`
3969       if (UserVF.isScalable())
3970         return FixedScalableVFPair(
3971             ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
3972 
3973       return UserVF;
3974     }
3975 
3976     assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
3977 
3978     // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
3979     // is better to ignore the hint and let the compiler choose a suitable VF.
3980     if (!UserVF.isScalable()) {
3981       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3982                         << " is unsafe, clamping to max safe VF="
3983                         << MaxSafeFixedVF << ".\n");
3984       ORE->emit([&]() {
3985         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
3986                                           TheLoop->getStartLoc(),
3987                                           TheLoop->getHeader())
3988                << "User-specified vectorization factor "
3989                << ore::NV("UserVectorizationFactor", UserVF)
3990                << " is unsafe, clamping to maximum safe vectorization factor "
3991                << ore::NV("VectorizationFactor", MaxSafeFixedVF);
3992       });
3993       return MaxSafeFixedVF;
3994     }
3995 
3996     if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
3997       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3998                         << " is ignored because scalable vectors are not "
3999                            "available.\n");
4000       ORE->emit([&]() {
4001         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4002                                           TheLoop->getStartLoc(),
4003                                           TheLoop->getHeader())
4004                << "User-specified vectorization factor "
4005                << ore::NV("UserVectorizationFactor", UserVF)
4006                << " is ignored because the target does not support scalable "
4007                   "vectors. The compiler will pick a more suitable value.";
4008       });
4009     } else {
4010       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4011                         << " is unsafe. Ignoring scalable UserVF.\n");
4012       ORE->emit([&]() {
4013         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4014                                           TheLoop->getStartLoc(),
4015                                           TheLoop->getHeader())
4016                << "User-specified vectorization factor "
4017                << ore::NV("UserVectorizationFactor", UserVF)
4018                << " is unsafe. Ignoring the hint to let the compiler pick a "
4019                   "more suitable value.";
4020       });
4021     }
4022   }
4023 
4024   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
4025                     << " / " << WidestType << " bits.\n");
4026 
4027   FixedScalableVFPair Result(ElementCount::getFixed(1),
4028                              ElementCount::getScalable(0));
4029   if (auto MaxVF =
4030           getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
4031                                   MaxSafeFixedVF, FoldTailByMasking))
4032     Result.FixedVF = MaxVF;
4033 
4034   if (auto MaxVF =
4035           getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
4036                                   MaxSafeScalableVF, FoldTailByMasking))
4037     if (MaxVF.isScalable()) {
4038       Result.ScalableVF = MaxVF;
4039       LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
4040                         << "\n");
4041     }
4042 
4043   return Result;
4044 }
4045 
4046 FixedScalableVFPair
4047 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
4048   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
4049     // TODO: It may be useful to do this anyway, since the check is still likely
4050     // to be dynamically uniform if the target can skip it.
4051     reportVectorizationFailure(
4052         "Not inserting runtime ptr check for divergent target",
4053         "runtime pointer checks needed. Not enabled for divergent target",
4054         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
4055     return FixedScalableVFPair::getNone();
4056   }
4057 
4058   ScalarEvolution *SE = PSE.getSE();
4059   unsigned TC = SE->getSmallConstantTripCount(TheLoop);
4060   unsigned MaxTC = PSE.getSmallConstantMaxTripCount();
4061   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4062   if (TC != MaxTC)
4063     LLVM_DEBUG(dbgs() << "LV: Found maximum trip count: " << MaxTC << '\n');
4064   if (TC == 1) {
4065     reportVectorizationFailure("Single iteration (non) loop",
4066         "loop trip count is one, irrelevant for vectorization",
4067         "SingleIterationLoop", ORE, TheLoop);
4068     return FixedScalableVFPair::getNone();
4069   }
4070 
4071   // If BTC matches the widest induction type and is -1 then the trip count
4072   // computation will wrap to 0 and the vector trip count will be 0. Do not try
4073   // to vectorize.
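  // For example (illustrative): with an i32 widest induction type and
  // BTC == 0xFFFFFFFF, the trip count BTC + 1 wraps to 0.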
4074   const SCEV *BTC = SE->getBackedgeTakenCount(TheLoop);
4075   if (!isa<SCEVCouldNotCompute>(BTC) &&
4076       BTC->getType()->getScalarSizeInBits() >=
4077           Legal->getWidestInductionType()->getScalarSizeInBits() &&
4078       SE->isKnownPredicate(CmpInst::ICMP_EQ, BTC,
4079                            SE->getMinusOne(BTC->getType()))) {
4080     reportVectorizationFailure(
4081         "Trip count computation wrapped",
4082         "backedge-taken count is -1, loop trip count wrapped to 0",
4083         "TripCountWrapped", ORE, TheLoop);
4084     return FixedScalableVFPair::getNone();
4085   }
4086 
4087   switch (ScalarEpilogueStatus) {
4088   case CM_ScalarEpilogueAllowed:
4089     return computeFeasibleMaxVF(MaxTC, UserVF, false);
4090   case CM_ScalarEpilogueNotAllowedUsePredicate:
4091     [[fallthrough]];
4092   case CM_ScalarEpilogueNotNeededUsePredicate:
4093     LLVM_DEBUG(
4094         dbgs() << "LV: vector predicate hint/switch found.\n"
4095                << "LV: Not allowing scalar epilogue, creating predicated "
4096                << "vector loop.\n");
4097     break;
4098   case CM_ScalarEpilogueNotAllowedLowTripLoop:
4099     // fallthrough as a special case of OptForSize
4100   case CM_ScalarEpilogueNotAllowedOptSize:
4101     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
4102       LLVM_DEBUG(
4103           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
4104     else
4105       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
4106                         << "count.\n");
4107 
4108     // Bail if runtime checks are required, which are not good when optimising
4109     // for size.
4110     if (runtimeChecksRequired())
4111       return FixedScalableVFPair::getNone();
4112 
4113     break;
4114   }
4115 
4116   // The only loops we can vectorize without a scalar epilogue are loops with
4117   // a bottom-test and a single exiting block. We'd have to handle the fact
4118   // that not every instruction executes on the last iteration.  This will
4119   // require a lane mask which varies through the vector loop body.  (TODO)
4120   if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
4121     // If there was a tail-folding hint/switch, but we can't fold the tail by
4122     // masking, fallback to a vectorization with a scalar epilogue.
4123     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
4124       LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
4125                            "scalar epilogue instead.\n");
4126       ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4127       return computeFeasibleMaxVF(MaxTC, UserVF, false);
4128     }
4129     return FixedScalableVFPair::getNone();
4130   }
4131 
4132   // Now try tail folding.
4133 
4134   // Invalidate interleave groups that require an epilogue if we can't mask
4135   // the interleave-group.
4136   if (!useMaskedInterleavedAccesses(TTI)) {
4137     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
4138            "No decisions should have been taken at this point");
4139     // Note: There is no need to invalidate any cost modeling decisions here, as
4140     // none were taken so far.
4141     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
4142   }
4143 
4144   FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(MaxTC, UserVF, true);
4145 
4146   // Avoid tail folding if the trip count is known to be a multiple of any VF
4147   // we choose.
4148   std::optional<unsigned> MaxPowerOf2RuntimeVF =
4149       MaxFactors.FixedVF.getFixedValue();
4150   if (MaxFactors.ScalableVF) {
4151     std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
4152     if (MaxVScale && TTI.isVScaleKnownToBeAPowerOfTwo()) {
4153       MaxPowerOf2RuntimeVF = std::max<unsigned>(
4154           *MaxPowerOf2RuntimeVF,
4155           *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue());
4156     } else
4157       MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now.
4158   }
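  // For example (hypothetical values): with MaxFactors.ScalableVF == vscale x 4,
  // a power-of-two vscale and MaxVScale == 16, the largest power-of-2 runtime
  // VF considered here is 64.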
4159 
4160   if (MaxPowerOf2RuntimeVF && *MaxPowerOf2RuntimeVF > 0) {
4161     assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
4162            "MaxFixedVF must be a power of 2");
4163     unsigned MaxVFtimesIC =
4164         UserIC ? *MaxPowerOf2RuntimeVF * UserIC : *MaxPowerOf2RuntimeVF;
4165     ScalarEvolution *SE = PSE.getSE();
4166     // Currently only loops with countable exits are vectorized, but calling
4167     // getSymbolicMaxBackedgeTakenCount allows enablement work for loops with
4168     // uncountable exits whilst also ensuring the symbolic maximum and known
4169     // back-edge taken count remain identical for loops with countable exits.
4170     const SCEV *BackedgeTakenCount = PSE.getSymbolicMaxBackedgeTakenCount();
4171     assert(BackedgeTakenCount == PSE.getBackedgeTakenCount() &&
4172            "Invalid loop count");
4173     const SCEV *ExitCount = SE->getAddExpr(
4174         BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
4175     const SCEV *Rem = SE->getURemExpr(
4176         SE->applyLoopGuards(ExitCount, TheLoop),
4177         SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
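    // For example (hypothetical values): an exit count of 20 with
    // MaxVFtimesIC == 4 leaves a remainder of 0, so no tail remains; with
    // MaxVFtimesIC == 8 the remainder is 4 and tail folding (or a scalar
    // epilogue) is still needed.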
4178     if (Rem->isZero()) {
4179       // Accept MaxFixedVF if we do not have a tail.
4180       LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
4181       return MaxFactors;
4182     }
4183   }
4184 
4185   // If we don't know the precise trip count, or if the trip count that we
4186   // found modulo the vectorization factor is not zero, try to fold the tail
4187   // by masking.
4188   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
4189   setTailFoldingStyles(MaxFactors.ScalableVF.isScalable(), UserIC);
4190   if (foldTailByMasking()) {
4191     if (getTailFoldingStyle() == TailFoldingStyle::DataWithEVL) {
4192       LLVM_DEBUG(
4193           dbgs()
4194           << "LV: tail is folded with EVL, forcing unroll factor to be 1. Will "
4195              "try to generate VP Intrinsics with scalable vector "
4196              "factors only.\n");
4197       // Tail folded loop using VP intrinsics restricts the VF to be scalable
4198       // for now.
4199       // TODO: extend it for fixed vectors, if required.
4200       assert(MaxFactors.ScalableVF.isScalable() &&
4201              "Expected scalable vector factor.");
4202 
4203       MaxFactors.FixedVF = ElementCount::getFixed(1);
4204     }
4205     return MaxFactors;
4206   }
4207 
4208   // If there was a tail-folding hint/switch, but we can't fold the tail by
4209   // masking, fallback to a vectorization with a scalar epilogue.
4210   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
4211     LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
4212                          "scalar epilogue instead.\n");
4213     ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4214     return MaxFactors;
4215   }
4216 
4217   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
4218     LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
4219     return FixedScalableVFPair::getNone();
4220   }
4221 
4222   if (TC == 0) {
4223     reportVectorizationFailure(
4224         "unable to calculate the loop count due to complex control flow",
4225         "UnknownLoopCountComplexCFG", ORE, TheLoop);
4226     return FixedScalableVFPair::getNone();
4227   }
4228 
4229   reportVectorizationFailure(
4230       "Cannot optimize for size and vectorize at the same time.",
4231       "cannot optimize for size and vectorize at the same time. "
4232       "Enable vectorization of this loop with '#pragma clang loop "
4233       "vectorize(enable)' when compiling with -Os/-Oz",
4234       "NoTailLoopWithOptForSize", ORE, TheLoop);
4235   return FixedScalableVFPair::getNone();
4236 }
4237 
4238 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
4239     unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType,
4240     ElementCount MaxSafeVF, bool FoldTailByMasking) {
4241   bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
4242   const TypeSize WidestRegister = TTI.getRegisterBitWidth(
4243       ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
4244                            : TargetTransformInfo::RGK_FixedWidthVector);
4245 
4246   // Convenience function to return the minimum of two ElementCounts.
4247   auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
4248     assert((LHS.isScalable() == RHS.isScalable()) &&
4249            "Scalable flags must match");
4250     return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
4251   };
4252 
4253   // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
4254   // Note that both WidestRegister and WidestType may not be powers of 2.
4255   auto MaxVectorElementCount = ElementCount::get(
4256       llvm::bit_floor(WidestRegister.getKnownMinValue() / WidestType),
4257       ComputeScalableMaxVF);
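  // For example (hypothetical targets): 256-bit fixed-width registers with a
  // widest type of 32 bits give a fixed VF of 8, while a scalable target with
  // a 128-bit known-minimum register size gives vscale x 4 for the same type.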
4258   MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
4259   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
4260                     << (MaxVectorElementCount * WidestType) << " bits.\n");
4261 
4262   if (!MaxVectorElementCount) {
4263     LLVM_DEBUG(dbgs() << "LV: The target has no "
4264                       << (ComputeScalableMaxVF ? "scalable" : "fixed")
4265                       << " vector registers.\n");
4266     return ElementCount::getFixed(1);
4267   }
4268 
4269   unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue();
4270   if (MaxVectorElementCount.isScalable() &&
4271       TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
4272     auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
4273     auto Min = Attr.getVScaleRangeMin();
4274     WidestRegisterMinEC *= Min;
4275   }
4276 
4277   // When a scalar epilogue is required, at least one iteration of the scalar
4278   // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
4279   // max VF that results in a dead vector loop.
4280   if (MaxTripCount > 0 && requiresScalarEpilogue(true))
4281     MaxTripCount -= 1;
4282 
4283   if (MaxTripCount && MaxTripCount <= WidestRegisterMinEC &&
4284       (!FoldTailByMasking || isPowerOf2_32(MaxTripCount))) {
4285     // If upper bound loop trip count (TC) is known at compile time there is no
4286     // point in choosing VF greater than TC (as done in the loop below). Select
4287     // maximum power of two which doesn't exceed TC. If MaxVectorElementCount is
4288     // scalable, we only fall back on a fixed VF when the TC is less than or
4289     // equal to the known number of lanes.
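    // For example (illustrative): a (possibly adjusted) max trip count of 6
    // with a max element count of 8 clamps the VF to bit_floor(6) == 4.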
4290     auto ClampedUpperTripCount = llvm::bit_floor(MaxTripCount);
4291     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
4292                          "exceeding the constant trip count: "
4293                       << ClampedUpperTripCount << "\n");
4294     return ElementCount::get(
4295         ClampedUpperTripCount,
4296         FoldTailByMasking ? MaxVectorElementCount.isScalable() : false);
4297   }
4298 
4299   TargetTransformInfo::RegisterKind RegKind =
4300       ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
4301                            : TargetTransformInfo::RGK_FixedWidthVector;
4302   ElementCount MaxVF = MaxVectorElementCount;
4303   if (MaximizeBandwidth ||
4304       (MaximizeBandwidth.getNumOccurrences() == 0 &&
4305        (TTI.shouldMaximizeVectorBandwidth(RegKind) ||
4306         (UseWiderVFIfCallVariantsPresent && Legal->hasVectorCallVariants())))) {
4307     auto MaxVectorElementCountMaxBW = ElementCount::get(
4308         llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType),
4309         ComputeScalableMaxVF);
4310     MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
4311 
4312     // Collect all viable vectorization factors larger than the default MaxVF
4313     // (i.e. MaxVectorElementCount).
4314     SmallVector<ElementCount, 8> VFs;
4315     for (ElementCount VS = MaxVectorElementCount * 2;
4316          ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
4317       VFs.push_back(VS);
4318 
4319     // For each VF calculate its register usage.
4320     auto RUs = calculateRegisterUsage(VFs);
4321 
4322     // Select the largest VF which doesn't require more registers than existing
4323     // ones.
4324     for (int I = RUs.size() - 1; I >= 0; --I) {
4325       const auto &MLU = RUs[I].MaxLocalUsers;
4326       if (all_of(MLU, [&](decltype(MLU.front()) &LU) {
4327             return LU.second <= TTI.getNumberOfRegisters(LU.first);
4328           })) {
4329         MaxVF = VFs[I];
4330         break;
4331       }
4332     }
4333     if (ElementCount MinVF =
4334             TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
4335       if (ElementCount::isKnownLT(MaxVF, MinVF)) {
4336         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
4337                           << ") with target's minimum: " << MinVF << '\n');
4338         MaxVF = MinVF;
4339       }
4340     }
4341 
4342     // Invalidate any widening decisions we might have made, in case the loop
4343     // requires predication (decided later), but we have already made some
4344     // load/store widening decisions.
4345     invalidateCostModelingDecisions();
4346   }
4347   return MaxVF;
4348 }
4349 
4350 /// Convenience function that returns the value of vscale_range if
4351 /// vscale_range.min == vscale_range.max, and otherwise returns the value
4352 /// returned by the corresponding TTI method.
4353 static std::optional<unsigned>
4354 getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI) {
4355   const Function *Fn = L->getHeader()->getParent();
4356   if (Fn->hasFnAttribute(Attribute::VScaleRange)) {
4357     auto Attr = Fn->getFnAttribute(Attribute::VScaleRange);
4358     auto Min = Attr.getVScaleRangeMin();
4359     auto Max = Attr.getVScaleRangeMax();
4360     if (Max && Min == Max)
4361       return Max;
4362   }
4363 
4364   return TTI.getVScaleForTuning();
4365 }
4366 
4367 /// This function attempts to return a value that represents the vectorization
4368 /// factor at runtime. For fixed-width VFs we know this precisely at compile
4369 /// time, but for scalable VFs we calculate it based on an estimate of the
4370 /// vscale value.
4371 static unsigned getEstimatedRuntimeVF(const Loop *L,
4372                                       const TargetTransformInfo &TTI,
4373                                       ElementCount VF) {
4374   unsigned EstimatedVF = VF.getKnownMinValue();
4375   if (VF.isScalable())
4376     if (std::optional<unsigned> VScale = getVScaleForTuning(L, TTI))
4377       EstimatedVF *= *VScale;
4378   assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
4379   return EstimatedVF;
4380 }
4381 
4382 bool LoopVectorizationPlanner::isMoreProfitable(
4383     const VectorizationFactor &A, const VectorizationFactor &B,
4384     const unsigned MaxTripCount) const {
4385   InstructionCost CostA = A.Cost;
4386   InstructionCost CostB = B.Cost;
4387 
4388   // Improve estimate for the vector width if it is scalable.
4389   unsigned EstimatedWidthA = A.Width.getKnownMinValue();
4390   unsigned EstimatedWidthB = B.Width.getKnownMinValue();
4391   if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI)) {
4392     if (A.Width.isScalable())
4393       EstimatedWidthA *= *VScale;
4394     if (B.Width.isScalable())
4395       EstimatedWidthB *= *VScale;
4396   }
4397 
4398   // Assume vscale may be larger than 1 (or the value being tuned for),
4399   // so that scalable vectorization is slightly favorable over fixed-width
4400   // vectorization.
4401   bool PreferScalable = !TTI.preferFixedOverScalableIfEqualCost() &&
4402                         A.Width.isScalable() && !B.Width.isScalable();
4403 
4404   auto CmpFn = [PreferScalable](const InstructionCost &LHS,
4405                                 const InstructionCost &RHS) {
4406     return PreferScalable ? LHS <= RHS : LHS < RHS;
4407   };
4408 
4409   // To avoid the need for FP division:
4410   //      (CostA / EstimatedWidthA) < (CostB / EstimatedWidthB)
4411   // <=>  (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA)
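  // For example (hypothetical costs): A = {width 4, cost 8} and
  // B = {width 8, cost 10} compares 8 * 8 = 64 against 10 * 4 = 40, so B is
  // the more profitable factor (1.25 per lane vs 2 per lane).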
4412   if (!MaxTripCount)
4413     return CmpFn(CostA * EstimatedWidthB, CostB * EstimatedWidthA);
4414 
4415   auto GetCostForTC = [MaxTripCount, this](unsigned VF,
4416                                            InstructionCost VectorCost,
4417                                            InstructionCost ScalarCost) {
4418     // If the trip count is a known (possibly small) constant, the trip count
4419     // will be rounded up to an integer number of iterations under
4420     // FoldTailByMasking. The total cost in that case will be
4421     // VecCost*ceil(TripCount/VF). When not folding the tail, the total
4422     // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be
4423     // some extra overheads, but for the purpose of comparing the costs of
4424     // different VFs we can use this to compare the total loop-body cost
4425     // expected after vectorization.
4426     if (CM.foldTailByMasking())
4427       return VectorCost * divideCeil(MaxTripCount, VF);
4428     return VectorCost * (MaxTripCount / VF) + ScalarCost * (MaxTripCount % VF);
4429   };
4430 
4431   auto RTCostA = GetCostForTC(EstimatedWidthA, CostA, A.ScalarCost);
4432   auto RTCostB = GetCostForTC(EstimatedWidthB, CostB, B.ScalarCost);
4433   return CmpFn(RTCostA, RTCostB);
4434 }
4435 
4436 bool LoopVectorizationPlanner::isMoreProfitable(
4437     const VectorizationFactor &A, const VectorizationFactor &B) const {
4438   const unsigned MaxTripCount = PSE.getSmallConstantMaxTripCount();
4439   return LoopVectorizationPlanner::isMoreProfitable(A, B, MaxTripCount);
4440 }
4441 
4442 void LoopVectorizationPlanner::emitInvalidCostRemarks(
4443     OptimizationRemarkEmitter *ORE) {
4444   using RecipeVFPair = std::pair<VPRecipeBase *, ElementCount>;
4445   SmallVector<RecipeVFPair> InvalidCosts;
4446   for (const auto &Plan : VPlans) {
4447     for (ElementCount VF : Plan->vectorFactors()) {
4448       VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(),
4449                             CM, CM.CostKind);
4450       precomputeCosts(*Plan, VF, CostCtx);
4451       auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry());
4452       for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
4453         for (auto &R : *VPBB) {
4454           if (!R.cost(VF, CostCtx).isValid())
4455             InvalidCosts.emplace_back(&R, VF);
4456         }
4457       }
4458     }
4459   }
4460   if (InvalidCosts.empty())
4461     return;
4462 
4463   // Emit a report of VFs with invalid costs in the loop.
4464 
4465   // Group the remarks per recipe, keeping the recipe order from InvalidCosts.
4466   DenseMap<VPRecipeBase *, unsigned> Numbering;
4467   unsigned I = 0;
4468   for (auto &Pair : InvalidCosts)
4469     if (!Numbering.count(Pair.first))
4470       Numbering[Pair.first] = I++;
4471 
4472   // Sort the list, first on recipe(number) then on VF.
4473   sort(InvalidCosts, [&Numbering](RecipeVFPair &A, RecipeVFPair &B) {
4474     if (Numbering[A.first] != Numbering[B.first])
4475       return Numbering[A.first] < Numbering[B.first];
4476     const auto &LHS = A.second;
4477     const auto &RHS = B.second;
4478     return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
4479            std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
4480   });
4481 
4482   // For a list of ordered recipe-VF pairs:
4483   //   [(load, VF1), (load, VF2), (store, VF1)]
4484   // group the recipes together to emit separate remarks for:
4485   //   load  (VF1, VF2)
4486   //   store (VF1)
4487   auto Tail = ArrayRef<RecipeVFPair>(InvalidCosts);
4488   auto Subset = ArrayRef<RecipeVFPair>();
4489   do {
4490     if (Subset.empty())
4491       Subset = Tail.take_front(1);
4492 
4493     VPRecipeBase *R = Subset.front().first;
4494 
4495     unsigned Opcode =
4496         TypeSwitch<const VPRecipeBase *, unsigned>(R)
4497             .Case<VPHeaderPHIRecipe>(
4498                 [](const auto *R) { return Instruction::PHI; })
4499             .Case<VPWidenSelectRecipe>(
4500                 [](const auto *R) { return Instruction::Select; })
4501             .Case<VPWidenStoreRecipe>(
4502                 [](const auto *R) { return Instruction::Store; })
4503             .Case<VPWidenLoadRecipe>(
4504                 [](const auto *R) { return Instruction::Load; })
4505             .Case<VPWidenCallRecipe, VPWidenIntrinsicRecipe>(
4506                 [](const auto *R) { return Instruction::Call; })
4507             .Case<VPInstruction, VPWidenRecipe, VPReplicateRecipe,
4508                   VPWidenCastRecipe>(
4509                 [](const auto *R) { return R->getOpcode(); })
4510             .Case<VPInterleaveRecipe>([](const VPInterleaveRecipe *R) {
4511               return R->getStoredValues().empty() ? Instruction::Load
4512                                                   : Instruction::Store;
4513             });
4514 
4515     // If the next recipe is different, or if there are no other pairs,
4516     // emit a remark for the collated subset. e.g.
4517     //   [(load, VF1), (load, VF2))]
4518     // to emit:
4519     //  remark: invalid costs for 'load' at VF=(VF1, VF2)
4520     if (Subset == Tail || Tail[Subset.size()].first != R) {
4521       std::string OutString;
4522       raw_string_ostream OS(OutString);
4523       assert(!Subset.empty() && "Unexpected empty range");
4524       OS << "Recipe with invalid costs prevented vectorization at VF=(";
4525       for (const auto &Pair : Subset)
4526         OS << (Pair.second == Subset.front().second ? "" : ", ") << Pair.second;
4527       OS << "):";
4528       if (Opcode == Instruction::Call) {
4529         StringRef Name = "";
4530         if (auto *Int = dyn_cast<VPWidenIntrinsicRecipe>(R)) {
4531           Name = Int->getIntrinsicName();
4532         } else {
4533           auto *WidenCall = dyn_cast<VPWidenCallRecipe>(R);
4534           Function *CalledFn =
4535               WidenCall ? WidenCall->getCalledScalarFunction()
4536                         : cast<Function>(R->getOperand(R->getNumOperands() - 1)
4537                                              ->getLiveInIRValue());
4538           Name = CalledFn->getName();
4539         }
4540         OS << " call to " << Name;
4541       } else
4542         OS << " " << Instruction::getOpcodeName(Opcode);
4543       reportVectorizationInfo(OutString, "InvalidCost", ORE, OrigLoop, nullptr,
4544                               R->getDebugLoc());
4545       Tail = Tail.drop_front(Subset.size());
4546       Subset = {};
4547     } else
4548       // Grow the subset by one element
4549       Subset = Tail.take_front(Subset.size() + 1);
4550   } while (!Tail.empty());
4551 }
4552 
4553 /// Check if any recipe of \p Plan will generate a vector value, which will be
4554 /// assigned a vector register.
4555 static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
4556                                 const TargetTransformInfo &TTI) {
4557   assert(VF.isVector() && "Checking a scalar VF?");
4558   VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
4559   DenseSet<VPRecipeBase *> EphemeralRecipes;
4560   collectEphemeralRecipesForVPlan(Plan, EphemeralRecipes);
4561   // Set of already visited types.
4562   DenseSet<Type *> Visited;
4563   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
4564            vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
4565     for (VPRecipeBase &R : *VPBB) {
4566       if (EphemeralRecipes.contains(&R))
4567         continue;
4568       // Continue early if the recipe is considered to not produce a vector
4569       // result. Note that this includes VPInstruction where some opcodes may
4570       // produce a vector, to preserve existing behavior as VPInstructions model
4571       // aspects not directly mapped to existing IR instructions.
4572       switch (R.getVPDefID()) {
4573       case VPDef::VPDerivedIVSC:
4574       case VPDef::VPScalarIVStepsSC:
4575       case VPDef::VPScalarCastSC:
4576       case VPDef::VPReplicateSC:
4577       case VPDef::VPInstructionSC:
4578       case VPDef::VPCanonicalIVPHISC:
4579       case VPDef::VPVectorPointerSC:
4580       case VPDef::VPReverseVectorPointerSC:
4581       case VPDef::VPExpandSCEVSC:
4582       case VPDef::VPEVLBasedIVPHISC:
4583       case VPDef::VPPredInstPHISC:
4584       case VPDef::VPBranchOnMaskSC:
4585         continue;
4586       case VPDef::VPReductionSC:
4587       case VPDef::VPActiveLaneMaskPHISC:
4588       case VPDef::VPWidenCallSC:
4589       case VPDef::VPWidenCanonicalIVSC:
4590       case VPDef::VPWidenCastSC:
4591       case VPDef::VPWidenGEPSC:
4592       case VPDef::VPWidenIntrinsicSC:
4593       case VPDef::VPWidenSC:
4594       case VPDef::VPWidenSelectSC:
4595       case VPDef::VPBlendSC:
4596       case VPDef::VPFirstOrderRecurrencePHISC:
4597       case VPDef::VPWidenPHISC:
4598       case VPDef::VPWidenIntOrFpInductionSC:
4599       case VPDef::VPWidenPointerInductionSC:
4600       case VPDef::VPReductionPHISC:
4601       case VPDef::VPInterleaveSC:
4602       case VPDef::VPWidenLoadEVLSC:
4603       case VPDef::VPWidenLoadSC:
4604       case VPDef::VPWidenStoreEVLSC:
4605       case VPDef::VPWidenStoreSC:
4606         break;
4607       default:
4608         llvm_unreachable("unhandled recipe");
4609       }
4610 
4611       auto WillWiden = [&TTI, VF](Type *ScalarTy) {
4612         Type *VectorTy = toVectorTy(ScalarTy, VF);
4613         unsigned NumLegalParts = TTI.getNumberOfParts(VectorTy);
4614         if (!NumLegalParts)
4615           return false;
4616         if (VF.isScalable()) {
4617           // <vscale x 1 x iN> is assumed to be profitable over iN because
4618           // scalable registers are a distinct register class from scalar
4619           // ones. If we ever find a target which wants to lower scalable
4620           // vectors back to scalars, we'll need to update this code to
4621           // explicitly ask TTI about the register class uses for each part.
4622           return NumLegalParts <= VF.getKnownMinValue();
4623         }
4624         // Two or more parts that share a register - are vectorized.
4625         // Two or more parts that share a register are considered vectorized.
4626       };
4627 
4628       // If no def nor is a store, e.g., branches, continue - no value to check.
4629       // If no def and not a store (e.g., a branch), continue - no value to check.
4630           !isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe, VPInterleaveRecipe>(
4631               &R))
4632         continue;
4633       // For multi-def recipes (currently only interleaved loads), it suffices to
4634       // check the first def only.
4635       // For stores, check the stored value; for interleaved stores, it suffices
4636       // to check only the first stored value. In all cases this is the second
4637       // operand.
4638       VPValue *ToCheck =
4639           R.getNumDefinedValues() >= 1 ? R.getVPValue(0) : R.getOperand(1);
4640       Type *ScalarTy = TypeInfo.inferScalarType(ToCheck);
4641       if (!Visited.insert({ScalarTy}).second)
4642         continue;
4643       if (WillWiden(ScalarTy))
4644         return true;
4645     }
4646   }
4647 
4648   return false;
4649 }
4650 
4651 #ifndef NDEBUG
4652 VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
4653   InstructionCost ExpectedCost = CM.expectedCost(ElementCount::getFixed(1));
4654   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
4655   assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
4656   assert(any_of(VPlans,
4657                 [](std::unique_ptr<VPlan> &P) {
4658                   return P->hasVF(ElementCount::getFixed(1));
4659                 }) &&
4660          "Expected Scalar VF to be a candidate");
4661 
4662   const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost,
4663                                        ExpectedCost);
4664   VectorizationFactor ChosenFactor = ScalarCost;
4665 
4666   bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
4667   if (ForceVectorization &&
4668       (VPlans.size() > 1 || !VPlans[0]->hasScalarVFOnly())) {
4669     // Ignore scalar width, because the user explicitly wants vectorization.
4670     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
4671     // evaluation.
4672     ChosenFactor.Cost = InstructionCost::getMax();
4673   }
4674 
4675   for (auto &P : VPlans) {
4676     for (ElementCount VF : P->vectorFactors()) {
4677       // The cost for scalar VF=1 is already calculated, so ignore it.
4678       if (VF.isScalar())
4679         continue;
4680 
4681       InstructionCost C = CM.expectedCost(VF);
4682       VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost);
4683 
4684       unsigned Width = getEstimatedRuntimeVF(OrigLoop, TTI, Candidate.Width);
4685       LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << VF
4686                         << " costs: " << (Candidate.Cost / Width));
4687       if (VF.isScalable())
4688         LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
4689                           << getVScaleForTuning(OrigLoop, TTI).value_or(1)
4690                           << ")");
4691       LLVM_DEBUG(dbgs() << ".\n");
4692 
4693       if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
4694         LLVM_DEBUG(
4695             dbgs()
4696             << "LV: Not considering vector loop of width " << VF
4697             << " because it will not generate any vector instructions.\n");
4698         continue;
4699       }
4700 
4701       if (isMoreProfitable(Candidate, ChosenFactor))
4702         ChosenFactor = Candidate;
4703     }
4704   }
4705 
4706   if (!EnableCondStoresVectorization && CM.hasPredStores()) {
4707     reportVectorizationFailure(
4708         "There are conditional stores.",
4709         "store that is conditionally executed prevents vectorization",
4710         "ConditionalStore", ORE, OrigLoop);
4711     ChosenFactor = ScalarCost;
4712   }
4713 
4714   LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
4715                  !isMoreProfitable(ChosenFactor, ScalarCost)) dbgs()
4716              << "LV: Vectorization seems to be not beneficial, "
4717              << "but was forced by a user.\n");
4718   return ChosenFactor;
4719 }
4720 #endif
4721 
4722 bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
4723     ElementCount VF) const {
4724   // Cross iteration phis such as reductions need special handling and are
4725   // currently unsupported.
4726   if (any_of(OrigLoop->getHeader()->phis(),
4727              [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(&Phi); }))
4728     return false;
4729 
4730   // Phis with uses outside of the loop require special handling and are
4731   // currently unsupported.
4732   for (const auto &Entry : Legal->getInductionVars()) {
4733     // Look for uses of the value of the induction at the last iteration.
4734     Value *PostInc =
4735         Entry.first->getIncomingValueForBlock(OrigLoop->getLoopLatch());
4736     for (User *U : PostInc->users())
4737       if (!OrigLoop->contains(cast<Instruction>(U)))
4738         return false;
4739     // Look for uses of penultimate value of the induction.
4740     for (User *U : Entry.first->users())
4741       if (!OrigLoop->contains(cast<Instruction>(U)))
4742         return false;
4743   }
4744 
4745   // Epilogue vectorization code has not been audited to ensure it handles
4746   // non-latch exits properly.  It may be fine, but it needs to be audited and
4747   // tested.
4748   // TODO: Add support for loops with an early exit.
4749   if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch())
4750     return false;
4751 
4752   return true;
4753 }
4754 
4755 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
4756     const ElementCount VF, const unsigned IC) const {
4757   // FIXME: We need a much better cost-model to take different parameters such
4758   // as register pressure, code size increase and cost of extra branches into
4759   // account. For now we apply a very crude heuristic and only consider loops
4760   // with vectorization factors larger than a certain value.
4761 
4762   // Allow the target to opt out entirely.
4763   if (!TTI.preferEpilogueVectorization())
4764     return false;
4765 
4766   // We also consider epilogue vectorization unprofitable for targets that don't
4767   // consider interleaving beneficial (e.g. MVE).
4768   if (TTI.getMaxInterleaveFactor(VF) <= 1)
4769     return false;
4770 
4771   // TODO: PR #108190 introduced a discrepancy between fixed-width and scalable
4772   // VFs when deciding profitability.
4773   // See related "TODO: extend to support scalable VFs." in
4774   // selectEpilogueVectorizationFactor.
4775   unsigned Multiplier = VF.isFixed() ? IC : 1;
4776   unsigned MinVFThreshold = EpilogueVectorizationMinVF.getNumOccurrences() > 0
4777                                 ? EpilogueVectorizationMinVF
4778                                 : TTI.getEpilogueVectorizationMinVF();
4779   return getEstimatedRuntimeVF(TheLoop, TTI, VF * Multiplier) >= MinVFThreshold;
4780 }
4781 
4782 VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
4783     const ElementCount MainLoopVF, unsigned IC) {
4784   VectorizationFactor Result = VectorizationFactor::Disabled();
4785   if (!EnableEpilogueVectorization) {
4786     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n");
4787     return Result;
4788   }
4789 
4790   if (!CM.isScalarEpilogueAllowed()) {
4791     LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no "
4792                          "epilogue is allowed.\n");
4793     return Result;
4794   }
4795 
4796   // Not really a cost consideration, but check for unsupported cases here to
4797   // simplify the logic.
4798   if (!isCandidateForEpilogueVectorization(MainLoopVF)) {
4799     LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop "
4800                          "is not a supported candidate.\n");
4801     return Result;
4802   }
4803 
4804   if (EpilogueVectorizationForceVF > 1) {
4805     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n");
4806     ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF);
4807     if (hasPlanWithVF(ForcedEC))
4808       return {ForcedEC, 0, 0};
4809 
4810     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not "
4811                          "viable.\n");
4812     return Result;
4813   }
4814 
4815   if (OrigLoop->getHeader()->getParent()->hasOptSize() ||
4816       OrigLoop->getHeader()->getParent()->hasMinSize()) {
4817     LLVM_DEBUG(
4818         dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n");
4819     return Result;
4820   }
4821 
4822   if (!CM.isEpilogueVectorizationProfitable(MainLoopVF, IC)) {
4823     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
4824                          "this loop\n");
4825     return Result;
4826   }
4827 
4828   // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
4829   // the main loop handles 8 lanes per iteration. We could still benefit from
4830   // vectorizing the epilogue loop with VF=4.
4831   ElementCount EstimatedRuntimeVF =
4832       ElementCount::getFixed(getEstimatedRuntimeVF(OrigLoop, TTI, MainLoopVF));
4833 
4834   ScalarEvolution &SE = *PSE.getSE();
4835   Type *TCType = Legal->getWidestInductionType();
4836   const SCEV *RemainingIterations = nullptr;
4837   unsigned MaxTripCount = 0;
4838   for (auto &NextVF : ProfitableVFs) {
4839     // Skip candidate VFs without a corresponding VPlan.
4840     if (!hasPlanWithVF(NextVF.Width))
4841       continue;
4842 
4843     // Skip candidate VFs with widths >= the (estimated) runtime VF (scalable
4844     // vectors) or > the VF of the main loop (fixed vectors).
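    // For example, if MainLoopVF is vscale x 4 and vscale is estimated to be
    // 4, EstimatedRuntimeVF is 16: a fixed candidate of 16 is skipped, a
    // fixed candidate of 8 is kept, and any scalable candidate of at least
    // vscale x 4 is skipped.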
4845     if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
4846          ElementCount::isKnownGE(NextVF.Width, EstimatedRuntimeVF)) ||
4847         (NextVF.Width.isScalable() &&
4848          ElementCount::isKnownGE(NextVF.Width, MainLoopVF)) ||
4849         (!NextVF.Width.isScalable() && !MainLoopVF.isScalable() &&
4850          ElementCount::isKnownGT(NextVF.Width, MainLoopVF)))
4851       continue;
4852 
4853     // If NextVF is greater than the number of remaining iterations, the
4854     // epilogue loop would be dead. Skip such factors.
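    // For example, with a trip count of 100, MainLoopVF = 8 and IC = 2, the
    // main vector loop handles multiples of 16, leaving 100 urem 16 = 4
    // remaining iterations; a candidate epilogue VF of 8 is skipped, while a
    // VF of 4 can still be used.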
4855     if (!MainLoopVF.isScalable() && !NextVF.Width.isScalable()) {
4856       // TODO: extend to support scalable VFs.
4857       if (!RemainingIterations) {
4858         const SCEV *TC = vputils::getSCEVExprForVPValue(
4859             getPlanFor(NextVF.Width).getTripCount(), SE);
4860         assert(!isa<SCEVCouldNotCompute>(TC) &&
4861                "Trip count SCEV must be computable");
4862         RemainingIterations = SE.getURemExpr(
4863             TC, SE.getConstant(TCType, MainLoopVF.getKnownMinValue() * IC));
4864         MaxTripCount = MainLoopVF.getKnownMinValue() * IC - 1;
4865         if (SE.isKnownPredicate(CmpInst::ICMP_ULT, RemainingIterations,
4866                                 SE.getConstant(TCType, MaxTripCount))) {
4867           MaxTripCount =
4868               SE.getUnsignedRangeMax(RemainingIterations).getZExtValue();
4869         }
4870         LLVM_DEBUG(dbgs() << "LEV: Maximum Trip Count for Epilogue: "
4871                           << MaxTripCount << "\n");
4872       }
4873       if (SE.isKnownPredicate(
4874               CmpInst::ICMP_UGT,
4875               SE.getConstant(TCType, NextVF.Width.getKnownMinValue()),
4876               RemainingIterations))
4877         continue;
4878     }
4879 
4880     if (Result.Width.isScalar() ||
4881         isMoreProfitable(NextVF, Result, MaxTripCount))
4882       Result = NextVF;
4883   }
4884 
4885   if (Result != VectorizationFactor::Disabled())
4886     LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
4887                       << Result.Width << "\n");
4888   return Result;
4889 }
4890 
4891 std::pair<unsigned, unsigned>
4892 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
4893   unsigned MinWidth = -1U;
4894   unsigned MaxWidth = 8;
4895   const DataLayout &DL = TheFunction->getDataLayout();
4896   // For in-loop reductions, no element types are added to ElementTypesInLoop
4897   // if there are no loads/stores in the loop. In this case, check through the
4898   // reduction variables to determine the maximum width.
4899   if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
4900     // Reset MaxWidth so that we can find the smallest type used by recurrences
4901     // in the loop.
4902     MaxWidth = -1U;
4903     for (const auto &PhiDescriptorPair : Legal->getReductionVars()) {
4904       const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
4905       // When finding the min width used by the recurrence we need to account
4906       // for casts on the input operands of the recurrence.
4907       MaxWidth = std::min<unsigned>(
4908           MaxWidth, std::min<unsigned>(
4909                         RdxDesc.getMinWidthCastToRecurrenceTypeInBits(),
4910                         RdxDesc.getRecurrenceType()->getScalarSizeInBits()));
4911     }
4912   } else {
4913     for (Type *T : ElementTypesInLoop) {
4914       MinWidth = std::min<unsigned>(
4915           MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
4916       MaxWidth = std::max<unsigned>(
4917           MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
4918     }
4919   }
4920   return {MinWidth, MaxWidth};
4921 }
4922 
4923 void LoopVectorizationCostModel::collectElementTypesForWidening() {
4924   ElementTypesInLoop.clear();
4925   // For each block.
4926   for (BasicBlock *BB : TheLoop->blocks()) {
4927     // For each instruction in the loop.
4928     for (Instruction &I : BB->instructionsWithoutDebug()) {
4929       Type *T = I.getType();
4930 
4931       // Skip ignored values.
4932       if (ValuesToIgnore.count(&I))
4933         continue;
4934 
4935       // Only examine Loads, Stores and PHINodes.
4936       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
4937         continue;
4938 
4939       // Examine PHI nodes that are reduction variables. Update the type to
4940       // account for the recurrence type.
4941       if (auto *PN = dyn_cast<PHINode>(&I)) {
4942         if (!Legal->isReductionVariable(PN))
4943           continue;
4944         const RecurrenceDescriptor &RdxDesc =
4945             Legal->getReductionVars().find(PN)->second;
4946         if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
4947             TTI.preferInLoopReduction(RdxDesc.getOpcode(),
4948                                       RdxDesc.getRecurrenceType(),
4949                                       TargetTransformInfo::ReductionFlags()))
4950           continue;
4951         T = RdxDesc.getRecurrenceType();
4952       }
4953 
4954       // Examine the stored values.
4955       if (auto *ST = dyn_cast<StoreInst>(&I))
4956         T = ST->getValueOperand()->getType();
4957 
4958       assert(T->isSized() &&
4959              "Expected the load/store/recurrence type to be sized");
4960 
4961       ElementTypesInLoop.insert(T);
4962     }
4963   }
4964 }
4965 
4966 unsigned
4967 LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
4968                                                   InstructionCost LoopCost) {
4969   // -- The interleave heuristics --
4970   // We interleave the loop in order to expose ILP and reduce the loop overhead.
4971   // There are many micro-architectural considerations that we can't predict
4972   // at this level. For example, frontend pressure (on decode or fetch) due to
4973   // code size, or the number and capabilities of the execution ports.
4974   //
4975   // We use the following heuristics to select the interleave count:
4976   // 1. If the code has reductions, then we interleave to break the cross
4977   // iteration dependency.
4978   // 2. If the loop is really small, then we interleave to reduce the loop
4979   // overhead.
4980   // 3. We don't interleave if we think that we will spill registers to memory
4981   // due to the increased register pressure.
4982 
4983   if (!isScalarEpilogueAllowed())
4984     return 1;
4985 
4986   // Do not interleave if EVL is preferred and no User IC is specified.
4987   if (foldTailWithEVL()) {
4988     LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. "
4989                          "Unroll factor forced to be 1.\n");
4990     return 1;
4991   }
4992 
4993   // The max safe dependence distance was already used to cap the vector
       // width, so do not interleave further.
4994   if (!Legal->isSafeForAnyVectorWidth())
4995     return 1;
4996 
4997   // We don't attempt to perform interleaving for loops with uncountable early
4998   // exits because the VPInstruction::AnyOf code cannot currently handle
4999   // multiple parts.
5000   if (Legal->hasUncountableEarlyExit())
5001     return 1;
5002 
5003   auto BestKnownTC = getSmallBestKnownTC(PSE, TheLoop);
5004   const bool HasReductions = !Legal->getReductionVars().empty();
5005 
5006   // If we did not calculate the cost for VF (because the user selected the VF)
5007   // then we calculate the cost of VF here.
5008   if (LoopCost == 0) {
5009     LoopCost = expectedCost(VF);
5010     assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost");
5011 
5012     // Loop body is free and there is no need for interleaving.
5013     if (LoopCost == 0)
5014       return 1;
5015   }
5016 
5017   RegisterUsage R = calculateRegisterUsage({VF})[0];
5018   // We divide by these values below, so assume that each register class has
5019   // at least one instruction that uses at least one register.
5020   for (auto &Pair : R.MaxLocalUsers) {
5021     Pair.second = std::max(Pair.second, 1U);
5022   }
5023 
5024   // We calculate the interleave count using the following formula.
5025   // Subtract the number of loop invariants from the number of available
5026   // registers. These registers are used by all of the interleaved instances.
5027   // Next, divide the remaining registers by the number of registers that is
5028   // required by the loop, in order to estimate how many parallel instances
5029   // fit without causing spills. All of this is rounded down if necessary to be
5030   // a power of two. We want a power-of-two interleave count to simplify any
5031   // addressing operations or alignment considerations.
5032   // We also want a power-of-two interleave count to ensure that the induction
5033   // variable of the vector loop wraps to zero when the tail is folded by
5034   // masking; this currently happens under OptForSize, in which case IC is set to 1 above.
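  // For example, with 32 available registers in a class, 4 of them holding
  // loop-invariant values, and a peak of 5 local users, we get
  // (32 - 4) / 5 = 5, rounded down to an interleave count of 4. (The
  // induction-variable heuristic below subtracts one more register and one
  // more user from that calculation.)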
5035   unsigned IC = UINT_MAX;
5036 
5037   for (const auto &Pair : R.MaxLocalUsers) {
5038     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(Pair.first);
5039     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5040                       << " registers of "
5041                       << TTI.getRegisterClassName(Pair.first)
5042                       << " register class\n");
5043     if (VF.isScalar()) {
5044       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5045         TargetNumRegisters = ForceTargetNumScalarRegs;
5046     } else {
5047       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5048         TargetNumRegisters = ForceTargetNumVectorRegs;
5049     }
5050     unsigned MaxLocalUsers = Pair.second;
5051     unsigned LoopInvariantRegs = 0;
5052     if (R.LoopInvariantRegs.find(Pair.first) != R.LoopInvariantRegs.end())
5053       LoopInvariantRegs = R.LoopInvariantRegs[Pair.first];
5054 
5055     unsigned TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs) /
5056                                      MaxLocalUsers);
5057     // Don't count the induction variable as interleaved.
5058     if (EnableIndVarRegisterHeur) {
5059       TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5060                               std::max(1U, (MaxLocalUsers - 1)));
5061     }
5062 
5063     IC = std::min(IC, TmpIC);
5064   }
5065 
5066   // Clamp the interleave ranges to reasonable counts.
5067   unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
5068 
5069   // Check if the user has overridden the max.
5070   if (VF.isScalar()) {
5071     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5072       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5073   } else {
5074     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5075       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5076   }
5077 
5078   unsigned EstimatedVF = getEstimatedRuntimeVF(TheLoop, TTI, VF);
5079   unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5080   if (KnownTC > 0) {
5081     // If a scalar epilogue is required, at least one iteration must remain
5082     // scalar, so one fewer iteration is available for interleaving.
5083     unsigned AvailableTC =
5084         requiresScalarEpilogue(VF.isVector()) ? KnownTC - 1 : KnownTC;
5085 
5086     // If trip count is known we select between two prospective ICs, where
5087     // 1) the aggressive IC is capped by the trip count divided by VF
5088     // 2) the conservative IC is capped by the trip count divided by (VF * 2)
5089     // The final IC is selected in a way that the epilogue loop trip count is
5090     // minimized while maximizing the IC itself, so that we either run the
5091     // vector loop at least once if it generates a small epilogue loop, or else
5092     // we run the vector loop at least twice.
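    // For example, with AvailableTC = 100, EstimatedVF = 8 and a maximum
    // interleave count of 8: the aggressive bound is bit_floor(min(100 / 8,
    // 8)) = 8 and the conservative bound is bit_floor(min(100 / 16, 8)) = 4.
    // The scalar tails are 100 % 64 = 36 and 100 % 32 = 4 respectively, so
    // the conservative count of 4 is kept in this case.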
5093 
5094     unsigned InterleaveCountUB = bit_floor(
5095         std::max(1u, std::min(AvailableTC / EstimatedVF, MaxInterleaveCount)));
5096     unsigned InterleaveCountLB = bit_floor(std::max(
5097         1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
5098     MaxInterleaveCount = InterleaveCountLB;
5099 
5100     if (InterleaveCountUB != InterleaveCountLB) {
5101       unsigned TailTripCountUB =
5102           (AvailableTC % (EstimatedVF * InterleaveCountUB));
5103       unsigned TailTripCountLB =
5104           (AvailableTC % (EstimatedVF * InterleaveCountLB));
5105       // If both produce the same scalar tail, maximize the IC to do the same
5106       // work in fewer vector loop iterations.
5107       if (TailTripCountUB == TailTripCountLB)
5108         MaxInterleaveCount = InterleaveCountUB;
5109     }
5110   } else if (BestKnownTC && *BestKnownTC > 0) {
5111     // If a scalar epilogue is required, at least one iteration must remain
5112     // scalar, so one fewer iteration is available for interleaving.
5113     unsigned AvailableTC = requiresScalarEpilogue(VF.isVector())
5114                                ? (*BestKnownTC) - 1
5115                                : *BestKnownTC;
5116 
5117     // If the trip count is only an estimated compile-time constant, cap the
5118     // IC by the trip count divided by (VF * 2), so that the vector loop runs
5119     // at least twice; this makes interleaving seem profitable when an
5120     // epilogue loop is present. Since the exact trip count is not known, we
5121     // choose a conservative IC estimate.
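    // For example, with an estimated AvailableTC of 100, EstimatedVF = 8 and
    // a maximum interleave count of 8, the IC is capped at
    // bit_floor(min(100 / 16, 8)) = 4.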
5122     MaxInterleaveCount = bit_floor(std::max(
5123         1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
5124   }
5125 
5126   assert(MaxInterleaveCount > 0 &&
5127          "Maximum interleave count must be greater than 0");
5128 
5129   // Clamp the calculated IC to be between 1 and the max interleave count that
5130   // the target and trip count allow.
5131   if (IC > MaxInterleaveCount)
5132     IC = MaxInterleaveCount;
5133   else
5134     // Make sure IC is greater than 0.
5135     IC = std::max(1u, IC);
5136 
5137   assert(IC > 0 && "Interleave count must be greater than 0.");
5138 
5139   // Interleave if we vectorized this loop and there is a reduction that could
5140   // benefit from interleaving.
5141   if (VF.isVector() && HasReductions) {
5142     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5143     return IC;
5144   }
5145 
5146   // For any scalar loop that either requires runtime checks or predication we
5147   // are better off leaving this to the unroller. Note that if we've already
5148   // vectorized the loop we will have done the runtime check and so interleaving
5149   // won't require further checks.
5150   bool ScalarInterleavingRequiresPredication =
5151       (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) {
5152          return Legal->blockNeedsPredication(BB);
5153        }));
5154   bool ScalarInterleavingRequiresRuntimePointerCheck =
5155       (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
5156 
5157   // We want to interleave small loops in order to reduce the loop overhead and
5158   // potentially expose ILP opportunities.
5159   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
5160                     << "LV: IC is " << IC << '\n'
5161                     << "LV: VF is " << VF << '\n');
5162   const bool AggressivelyInterleaveReductions =
5163       TTI.enableAggressiveInterleaving(HasReductions);
5164   if (!ScalarInterleavingRequiresRuntimePointerCheck &&
5165       !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
5166     // We assume that the cost overhead is 1 and we use the cost model
5167     // to estimate the cost of the loop and interleave until the cost of the
5168     // loop overhead is about 5% of the cost of the loop.
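    // For example, assuming the default SmallLoopCost of 20 and a loop cost
    // of 6, SmallIC becomes min(IC, bit_floor(20 / 6)) = min(IC, 2).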
5169     unsigned SmallIC = std::min(IC, (unsigned)llvm::bit_floor<uint64_t>(
5170                                         SmallLoopCost / *LoopCost.getValue()));
5171 
5172     // Interleave until store/load ports (estimated by max interleave count) are
5173     // saturated.
5174     unsigned NumStores = Legal->getNumStores();
5175     unsigned NumLoads = Legal->getNumLoads();
5176     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5177     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5178 
5179     // There is little point in interleaving for reductions containing selects
5180     // and compares when VF=1 since it may just create more overhead than it's
5181     // worth for loops with small trip counts. This is because we still have to
5182     // do the final reduction after the loop.
5183     bool HasSelectCmpReductions =
5184         HasReductions &&
5185         any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5186           const RecurrenceDescriptor &RdxDesc = Reduction.second;
5187           RecurKind RK = RdxDesc.getRecurrenceKind();
5188           return RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) ||
5189                  RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK);
5190         });
5191     if (HasSelectCmpReductions) {
5192       LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
5193       return 1;
5194     }
5195 
5196     // If we have a scalar reduction (vector reductions are already dealt with
5197     // by this point), we can increase the critical path length if the loop
5198     // we're interleaving is inside another loop. For tree-wise reductions
5199     // set the limit to 2, and for ordered reductions it's best to disable
5200     // interleaving entirely.
5201     if (HasReductions && TheLoop->getLoopDepth() > 1) {
5202       bool HasOrderedReductions =
5203           any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5204             const RecurrenceDescriptor &RdxDesc = Reduction.second;
5205             return RdxDesc.isOrdered();
5206           });
5207       if (HasOrderedReductions) {
5208         LLVM_DEBUG(
5209             dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
5210         return 1;
5211       }
5212 
5213       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5214       SmallIC = std::min(SmallIC, F);
5215       StoresIC = std::min(StoresIC, F);
5216       LoadsIC = std::min(LoadsIC, F);
5217     }
5218 
5219     if (EnableLoadStoreRuntimeInterleave &&
5220         std::max(StoresIC, LoadsIC) > SmallIC) {
5221       LLVM_DEBUG(
5222           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5223       return std::max(StoresIC, LoadsIC);
5224     }
5225 
5226     // If there are scalar reductions and TTI has enabled aggressive
5227     // interleaving for reductions, we will interleave to expose ILP.
5228     if (VF.isScalar() && AggressivelyInterleaveReductions) {
5229       LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5230       // Interleave no less than SmallIC but not as aggressively as the normal
5231       // IC, to handle the rare situation where resources are too limited.
5232       return std::max(IC / 2, SmallIC);
5233     }
5234 
5235     LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5236     return SmallIC;
5237   }
5238 
5239   // Interleave if this is a large loop (small loops are already dealt with by
5240   // this point) that could benefit from interleaving.
5241   if (AggressivelyInterleaveReductions) {
5242     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5243     return IC;
5244   }
5245 
5246   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5247   return 1;
5248 }
5249 
5250 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5251 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
5252   // This function calculates the register usage by measuring the highest number
5253   // of values that are alive at a single location. Obviously, this is a very
5254   // rough estimation. We scan the loop in a topological order in order and
5255   // assign a number to each instruction. We use RPO to ensure that defs are
5256   // met before their users. We assume that each instruction that has in-loop
5257   // users starts an interval. We record every time that an in-loop value is
5258   // used, so we have a list of the first and last occurrences of each
5259   // instruction. Next, we transpose this data structure into a multi map that
5260   // holds the list of intervals that *end* at a specific location. This multi
5261   // map allows us to perform a linear search. We scan the instructions linearly
5262   // and record each time that a new interval starts, by placing it in a set.
5263   // If we find this value in the multi-map then we remove it from the set.
5264   // The max register usage is the maximum size of the set.
5265   // We also search for instructions that are defined outside the loop, but are
5266   // used inside the loop. We need this number separately from the max-interval
5267   // usage number because when we unroll, loop-invariant values do not take
5268   // more registers.
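  // For example, in a body %a = load ...; %b = load ...; %c = add %a, %b;
  // store %c, both %a and %b are still open when %c is visited, so the
  // maximum usage for their register class is two simultaneously live values
  // (with each widened value scaled by its per-type register usage for the
  // given VF). The snippet is illustrative only.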
5269   LoopBlocksDFS DFS(TheLoop);
5270   DFS.perform(LI);
5271 
5272   RegisterUsage RU;
5273 
5274   // Each 'key' in the map opens a new interval. The values
5275   // of the map are the index of the 'last seen' usage of the
5276   // instruction that is the key.
5277   using IntervalMap = SmallDenseMap<Instruction *, unsigned, 16>;
5278 
5279   // Maps instruction to its index.
5280   SmallVector<Instruction *, 64> IdxToInstr;
5281   // Marks the end of each interval.
5282   IntervalMap EndPoint;
5283   // Saves the set of instructions that are used within the loop.
5284   SmallPtrSet<Instruction *, 8> Ends;
5285   // Saves the list of values that are used in the loop but are defined outside
5286   // the loop (not including non-instruction values such as arguments and
5287   // constants).
5288   SmallSetVector<Instruction *, 8> LoopInvariants;
5289 
5290   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5291     for (Instruction &I : BB->instructionsWithoutDebug()) {
5292       IdxToInstr.push_back(&I);
5293 
5294       // Save the end location of each USE.
5295       for (Value *U : I.operands()) {
5296         auto *Instr = dyn_cast<Instruction>(U);
5297 
5298         // Ignore non-instruction values such as arguments, constants, etc.
5299         // FIXME: Might need some motivation why these values are ignored. If
5300         // for example an argument is used inside the loop it will increase the
5301   // register pressure (so shouldn't we add it to LoopInvariants?).
5302         if (!Instr)
5303           continue;
5304 
5305         // If this instruction is outside the loop then record it and continue.
5306         if (!TheLoop->contains(Instr)) {
5307           LoopInvariants.insert(Instr);
5308           continue;
5309         }
5310 
5311         // Overwrite previous end points.
5312         EndPoint[Instr] = IdxToInstr.size();
5313         Ends.insert(Instr);
5314       }
5315     }
5316   }
5317 
5318   // Saves the list of intervals that end with the index in 'key'.
5319   using InstrList = SmallVector<Instruction *, 2>;
5320   SmallDenseMap<unsigned, InstrList, 16> TransposeEnds;
5321 
5322   // Transpose the EndPoints to a list of values that end at each index.
5323   for (auto &Interval : EndPoint)
5324     TransposeEnds[Interval.second].push_back(Interval.first);
5325 
5326   SmallPtrSet<Instruction *, 8> OpenIntervals;
5327   SmallVector<RegisterUsage, 8> RUs(VFs.size());
5328   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
5329 
5330   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5331 
5332   const auto &TTICapture = TTI;
5333   auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
5334     if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty) ||
5335         (VF.isScalable() &&
5336          !TTICapture.isElementTypeLegalForScalableVector(Ty)))
5337       return 0;
5338     return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
5339   };
5340 
5341   for (unsigned int Idx = 0, Sz = IdxToInstr.size(); Idx < Sz; ++Idx) {
5342     Instruction *I = IdxToInstr[Idx];
5343 
5344     // Remove all of the instructions that end at this location.
5345     InstrList &List = TransposeEnds[Idx];
5346     for (Instruction *ToRemove : List)
5347       OpenIntervals.erase(ToRemove);
5348 
5349     // Ignore instructions that are never used within the loop.
5350     if (!Ends.count(I))
5351       continue;
5352 
5353     // Skip ignored values.
5354     if (ValuesToIgnore.count(I))
5355       continue;
5356 
5357     collectInLoopReductions();
5358 
5359     // For each VF find the maximum usage of registers.
5360     for (unsigned J = 0, E = VFs.size(); J < E; ++J) {
5361       // Count the number of registers used, per register class, given all open
5362       // intervals.
5363       // Note that elements in this SmallMapVector will be default constructed
5364       // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if
5365       // there is no previous entry for ClassID.
5366       SmallMapVector<unsigned, unsigned, 4> RegUsage;
5367 
5368       if (VFs[J].isScalar()) {
5369         for (auto *Inst : OpenIntervals) {
5370           unsigned ClassID =
5371               TTI.getRegisterClassForType(false, Inst->getType());
5372           // FIXME: The target might use more than one register for the type
5373           // even in the scalar case.
5374           RegUsage[ClassID] += 1;
5375         }
5376       } else {
5377         collectUniformsAndScalars(VFs[J]);
5378         for (auto *Inst : OpenIntervals) {
5379           // Skip ignored values for VF > 1.
5380           if (VecValuesToIgnore.count(Inst))
5381             continue;
5382           if (isScalarAfterVectorization(Inst, VFs[J])) {
5383             unsigned ClassID =
5384                 TTI.getRegisterClassForType(false, Inst->getType());
5385             // FIXME: The target might use more than one register for the type
5386             // even in the scalar case.
5387             RegUsage[ClassID] += 1;
5388           } else {
5389             unsigned ClassID =
5390                 TTI.getRegisterClassForType(true, Inst->getType());
5391             RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[J]);
5392           }
5393         }
5394       }
5395 
5396       for (const auto &Pair : RegUsage) {
5397         auto &Entry = MaxUsages[J][Pair.first];
5398         Entry = std::max(Entry, Pair.second);
5399       }
5400     }
5401 
5402     LLVM_DEBUG(dbgs() << "LV(REG): At #" << Idx << " Interval # "
5403                       << OpenIntervals.size() << '\n');
5404 
5405     // Add the current instruction to the list of open intervals.
5406     OpenIntervals.insert(I);
5407   }
5408 
5409   for (unsigned Idx = 0, End = VFs.size(); Idx < End; ++Idx) {
5410     // Note that elements in this SmallMapVector will be default constructed
5411     // as 0. So we can use "Invariant[ClassID] += n" in the code below even if
5412     // there is no previous entry for ClassID.
5413     SmallMapVector<unsigned, unsigned, 4> Invariant;
5414 
5415     for (auto *Inst : LoopInvariants) {
5416       // FIXME: The target might use more than one register for the type
5417       // even in the scalar case.
5418       bool IsScalar = all_of(Inst->users(), [&](User *U) {
5419         auto *I = cast<Instruction>(U);
5420         return TheLoop != LI->getLoopFor(I->getParent()) ||
5421                isScalarAfterVectorization(I, VFs[Idx]);
5422       });
5423 
5424       ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[Idx];
5425       unsigned ClassID =
5426           TTI.getRegisterClassForType(VF.isVector(), Inst->getType());
5427       Invariant[ClassID] += GetRegUsage(Inst->getType(), VF);
5428     }
5429 
5430     LLVM_DEBUG({
5431       dbgs() << "LV(REG): VF = " << VFs[Idx] << '\n';
5432       dbgs() << "LV(REG): Found max usage: " << MaxUsages[Idx].size()
5433              << " item\n";
5434       for (const auto &pair : MaxUsages[Idx]) {
5435         dbgs() << "LV(REG): RegisterClass: "
5436                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5437                << " registers\n";
5438       }
5439       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
5440              << " item\n";
5441       for (const auto &pair : Invariant) {
5442         dbgs() << "LV(REG): RegisterClass: "
5443                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5444                << " registers\n";
5445       }
5446     });
5447 
5448     RU.LoopInvariantRegs = Invariant;
5449     RU.MaxLocalUsers = MaxUsages[Idx];
5450     RUs[Idx] = RU;
5451   }
5452 
5453   return RUs;
5454 }
5455 
5456 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
5457                                                            ElementCount VF) {
5458   // TODO: Cost model for emulated masked load/store is completely
5459   // broken. This hack guides the cost model to use an artificially
5460   // high enough value to practically disable vectorization with such
5461   // operations, except where previously deployed legality hack allowed
5462   // using very low cost values. This is to avoid regressions coming simply
5463   // from moving "masked load/store" check from legality to cost model.
5464   // Masked Load/Gather emulation was previously never allowed.
5465   // A limited amount of Masked Store/Scatter emulation was allowed.
5466   assert((isPredicatedInst(I)) &&
5467          "Expecting a scalar emulated instruction");
5468   return isa<LoadInst>(I) ||
5469          (isa<StoreInst>(I) &&
5470           NumPredStores > NumberOfStoresToPredicate);
5471 }
5472 
5473 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
5474   // If we aren't vectorizing the loop, or if we've already collected the
5475   // instructions to scalarize, there's nothing to do. Collection may already
5476   // have occurred if we have a user-selected VF and are now computing the
5477   // expected cost for interleaving.
5478   if (VF.isScalar() || VF.isZero() || InstsToScalarize.contains(VF))
5479     return;
5480 
5481   // Initialize a mapping for VF in InstsToScalarize. If we find that it's
5482   // not profitable to scalarize any instructions, the presence of VF in the
5483   // map will indicate that we've analyzed it already.
5484   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5485 
5486   PredicatedBBsAfterVectorization[VF].clear();
5487 
5488   // Find all the instructions that are scalar with predication in the loop and
5489   // determine if it would be better to not if-convert the blocks they are in.
5490   // If so, we also record the instructions to scalarize.
5491   for (BasicBlock *BB : TheLoop->blocks()) {
5492     if (!blockNeedsPredicationForAnyReason(BB))
5493       continue;
5494     for (Instruction &I : *BB)
5495       if (isScalarWithPredication(&I, VF)) {
5496         ScalarCostsTy ScalarCosts;
5497         // Do not apply discount logic for:
5498         // 1. Scalars after vectorization, as there will only be a single copy
5499         // of the instruction.
5500         // 2. Scalable VF, as that would lead to invalid scalarization costs.
5501         // 3. Emulated masked memrefs, if a hacked cost is needed.
5502         if (!isScalarAfterVectorization(&I, VF) && !VF.isScalable() &&
5503             !useEmulatedMaskMemRefHack(&I, VF) &&
5504             computePredInstDiscount(&I, ScalarCosts, VF) >= 0) {
5505           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5506           // Check if we decided to scalarize a call. If so, update the widening
5507           // decision of the call to CM_Scalarize with the computed scalar cost.
5508           for (const auto &[I, _] : ScalarCosts) {
5509             auto *CI = dyn_cast<CallInst>(I);
5510             if (!CI || !CallWideningDecisions.contains({CI, VF}))
5511               continue;
5512             CallWideningDecisions[{CI, VF}].Kind = CM_Scalarize;
5513             CallWideningDecisions[{CI, VF}].Cost = ScalarCosts[CI];
5514           }
5515         }
5516         // Remember that BB will remain after vectorization.
5517         PredicatedBBsAfterVectorization[VF].insert(BB);
5518         for (auto *Pred : predecessors(BB)) {
5519           if (Pred->getSingleSuccessor() == BB)
5520             PredicatedBBsAfterVectorization[VF].insert(Pred);
5521         }
5522       }
5523   }
5524 }
5525 
5526 InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
5527     Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
5528   assert(!isUniformAfterVectorization(PredInst, VF) &&
5529          "Instruction marked uniform-after-vectorization will be predicated");
5530 
5531   // Initialize the discount to zero, meaning that the scalar version and the
5532   // vector version cost the same.
5533   InstructionCost Discount = 0;
5534 
5535   // Holds instructions to analyze. The instructions we visit are mapped in
5536   // ScalarCosts. Those instructions are the ones that would be scalarized if
5537   // we find that the scalar version costs less.
5538   SmallVector<Instruction *, 8> Worklist;
5539 
5540   // Returns true if the given instruction can be scalarized.
5541   auto CanBeScalarized = [&](Instruction *I) -> bool {
5542     // We only attempt to scalarize instructions forming a single-use chain
5543     // from the original predicated block that would otherwise be vectorized.
5544     // Although not strictly necessary, we give up on instructions we know will
5545     // already be scalar to avoid traversing chains that are unlikely to be
5546     // beneficial.
5547     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5548         isScalarAfterVectorization(I, VF))
5549       return false;
5550 
5551     // If the instruction is scalar with predication, it will be analyzed
5552     // separately. We ignore it within the context of PredInst.
5553     if (isScalarWithPredication(I, VF))
5554       return false;
5555 
5556     // If any of the instruction's operands are uniform after vectorization,
5557     // the instruction cannot be scalarized. This prevents, for example, a
5558     // masked load from being scalarized.
5559     //
5560     // We assume we will only emit a value for lane zero of an instruction
5561     // marked uniform after vectorization, rather than VF identical values.
5562     // Thus, if we scalarize an instruction that uses a uniform, we would
5563     // create uses of values corresponding to the lanes we aren't emitting code
5564     // for. This behavior can be changed by allowing getScalarValue to clone
5565     // the lane zero values for uniforms rather than asserting.
5566     for (Use &U : I->operands())
5567       if (auto *J = dyn_cast<Instruction>(U.get()))
5568         if (isUniformAfterVectorization(J, VF))
5569           return false;
5570 
5571     // Otherwise, we can scalarize the instruction.
5572     return true;
5573   };
5574 
5575   // Compute the expected cost discount from scalarizing the entire expression
5576   // feeding the predicated instruction. We currently only consider expressions
5577   // that are single-use instruction chains.
5578   Worklist.push_back(PredInst);
5579   while (!Worklist.empty()) {
5580     Instruction *I = Worklist.pop_back_val();
5581 
5582     // If we've already analyzed the instruction, there's nothing to do.
5583     if (ScalarCosts.contains(I))
5584       continue;
5585 
5586     // Compute the cost of the vector instruction. Note that this cost already
5587     // includes the scalarization overhead of the predicated instruction.
5588     InstructionCost VectorCost = getInstructionCost(I, VF);
5589 
5590     // Compute the cost of the scalarized instruction. This cost is the cost of
5591     // the instruction as if it wasn't if-converted and instead remained in the
5592     // predicated block. We will scale this cost by block probability after
5593     // computing the scalarization overhead.
5594     InstructionCost ScalarCost =
5595         VF.getFixedValue() * getInstructionCost(I, ElementCount::getFixed(1));
5596 
5597     // Compute the scalarization overhead of needed insertelement instructions
5598     // and phi nodes.
5599     if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
5600       ScalarCost += TTI.getScalarizationOverhead(
5601           cast<VectorType>(toVectorTy(I->getType(), VF)),
5602           APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ true,
5603           /*Extract*/ false, CostKind);
5604       ScalarCost +=
5605           VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind);
5606     }
5607 
5608     // Compute the scalarization overhead of needed extractelement
5609     // instructions. For each of the instruction's operands, if the operand can
5610     // be scalarized, add it to the worklist; otherwise, account for the
5611     // overhead.
5612     for (Use &U : I->operands())
5613       if (auto *J = dyn_cast<Instruction>(U.get())) {
5614         assert(VectorType::isValidElementType(J->getType()) &&
5615                "Instruction has non-scalar type");
5616         if (CanBeScalarized(J))
5617           Worklist.push_back(J);
5618         else if (needsExtract(J, VF)) {
5619           ScalarCost += TTI.getScalarizationOverhead(
5620               cast<VectorType>(toVectorTy(J->getType(), VF)),
5621               APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false,
5622               /*Extract*/ true, CostKind);
5623         }
5624       }
5625 
5626     // Scale the total scalar cost by block probability.
5627     ScalarCost /= getReciprocalPredBlockProb();
5628 
5629     // Compute the discount. A non-negative discount means the vector version
5630     // of the instruction costs more, and scalarizing would be beneficial.
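    // For example, if the vector form of I costs 10 and its scalarized,
    // probability-scaled form costs 6, the discount grows by 4; a final
    // non-negative total means scalarizing the chain is expected to be at
    // least as cheap as vectorizing it.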
5631     Discount += VectorCost - ScalarCost;
5632     ScalarCosts[I] = ScalarCost;
5633   }
5634 
5635   return Discount;
5636 }
5637 
5638 InstructionCost LoopVectorizationCostModel::expectedCost(ElementCount VF) {
5639   InstructionCost Cost;
5640 
5641   // If the vector loop gets executed exactly once with the given VF, ignore the
5642   // costs of comparison and induction instructions, as they'll get simplified
5643   // away.
5644   SmallPtrSet<Instruction *, 2> ValuesToIgnoreForVF;
5645   auto TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5646   if (VF.isFixed() && TC == VF.getFixedValue() && !foldTailByMasking())
5647     addFullyUnrolledInstructionsToIgnore(TheLoop, Legal->getInductionVars(),
5648                                          ValuesToIgnoreForVF);
5649 
5650   // For each block.
5651   for (BasicBlock *BB : TheLoop->blocks()) {
5652     InstructionCost BlockCost;
5653 
5654     // For each instruction in the old loop.
5655     for (Instruction &I : BB->instructionsWithoutDebug()) {
5656       // Skip ignored values.
5657       if (ValuesToIgnore.count(&I) || ValuesToIgnoreForVF.count(&I) ||
5658           (VF.isVector() && VecValuesToIgnore.count(&I)))
5659         continue;
5660 
5661       InstructionCost C = getInstructionCost(&I, VF);
5662 
5663       // Check if we should override the cost.
5664       if (C.isValid() && ForceTargetInstructionCost.getNumOccurrences() > 0)
5665         C = InstructionCost(ForceTargetInstructionCost);
5666 
5667       BlockCost += C;
5668       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF "
5669                         << VF << " For instruction: " << I << '\n');
5670     }
5671 
5672     // If we are vectorizing a predicated block, it will have been
5673     // if-converted. This means that the block's instructions (aside from
5674     // stores and instructions that may divide by zero) will now be
5675     // unconditionally executed. For the scalar case, we may not always execute
5676     // the predicated block, if it is an if-else block. Thus, scale the block's
5677     // cost by the probability of executing it. blockNeedsPredication from
5678     // Legal is used so as to not include all blocks in tail folded loops.
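    // For example, a predicated block with cost 8 that is assumed to execute
    // on roughly half of the iterations (a reciprocal probability of 2)
    // contributes only 8 / 2 = 4 to the scalar loop cost.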
5679     if (VF.isScalar() && Legal->blockNeedsPredication(BB))
5680       BlockCost /= getReciprocalPredBlockProb();
5681 
5682     Cost += BlockCost;
5683   }
5684 
5685   return Cost;
5686 }
5687 
5688 /// Gets Address Access SCEV after verifying that the access pattern
5689 /// is loop invariant except for the induction variable dependence.
5690 ///
5691 /// This SCEV can be sent to the Target in order to estimate the address
5692 /// calculation cost.
5693 static const SCEV *getAddressAccessSCEV(
5694               Value *Ptr,
5695               LoopVectorizationLegality *Legal,
5696               PredicatedScalarEvolution &PSE,
5697               const Loop *TheLoop) {
5698 
5699   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
5700   if (!Gep)
5701     return nullptr;
5702 
5703   // We are looking for a GEP with all loop-invariant indices except for one,
5704   // which should be an induction variable.
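  // For example, a GEP such as "getelementptr float, ptr %base, i64 %iv",
  // where %base is loop-invariant and %iv is an induction variable,
  // qualifies; a GEP with another loop-varying, non-induction index does not,
  // and nullptr is returned for it.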
5705   auto *SE = PSE.getSE();
5706   unsigned NumOperands = Gep->getNumOperands();
5707   for (unsigned Idx = 1; Idx < NumOperands; ++Idx) {
5708     Value *Opd = Gep->getOperand(Idx);
5709     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
5710         !Legal->isInductionVariable(Opd))
5711       return nullptr;
5712   }
5713 
5714   // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
5715   return PSE.getSCEV(Ptr);
5716 }
5717 
5718 InstructionCost
5719 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
5720                                                         ElementCount VF) {
5721   assert(VF.isVector() &&
5722          "Scalarization cost of instruction implies vectorization.");
5723   if (VF.isScalable())
5724     return InstructionCost::getInvalid();
5725 
5726   Type *ValTy = getLoadStoreType(I);
5727   auto *SE = PSE.getSE();
5728 
5729   unsigned AS = getLoadStoreAddressSpace(I);
5730   Value *Ptr = getLoadStorePointerOperand(I);
5731   Type *PtrTy = toVectorTy(Ptr->getType(), VF);
5732   // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
5733   //       that it is being called from this specific place.
5734 
5735   // Figure out whether the access is strided and get the stride value
5736   // if it's known at compile time.
5737   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
5738 
5739   // Get the cost of the scalar memory instruction and address computation.
5740   InstructionCost Cost =
5741       VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
5742 
5743   // Don't pass *I here, since it is scalar but will actually be part of a
5744   // vectorized loop where the user of it is a vectorized instruction.
5745   const Align Alignment = getLoadStoreAlignment(I);
5746   Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(),
5747                                                       ValTy->getScalarType(),
5748                                                       Alignment, AS, CostKind);
5749 
5750   // Get the overhead of the extractelement and insertelement instructions
5751   // we might create due to scalarization.
5752   Cost += getScalarizationOverhead(I, VF);
5753 
5754   // If we have a predicated load/store, it will need extra i1 extracts and
5755   // conditional branches, but may not be executed for each vector lane. Scale
5756   // the cost by the probability of executing the predicated block.
5757   if (isPredicatedInst(I)) {
5758     Cost /= getReciprocalPredBlockProb();
5759 
5760     // Add the cost of an i1 extract and a branch
5761     auto *VecI1Ty =
5762         VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
5763     Cost += TTI.getScalarizationOverhead(
5764         VecI1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
5765         /*Insert=*/false, /*Extract=*/true, CostKind);
5766     Cost += TTI.getCFInstrCost(Instruction::Br, CostKind);
5767 
5768     if (useEmulatedMaskMemRefHack(I, VF))
5769       // Artificially setting to a high enough value to practically disable
5770       // vectorization with such operations.
5771       Cost = 3000000;
5772   }
5773 
5774   return Cost;
5775 }
5776 
5777 InstructionCost
5778 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
5779                                                     ElementCount VF) {
5780   Type *ValTy = getLoadStoreType(I);
5781   auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5782   Value *Ptr = getLoadStorePointerOperand(I);
5783   unsigned AS = getLoadStoreAddressSpace(I);
5784   int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
5785 
5786   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5787          "Stride should be 1 or -1 for consecutive memory access");
5788   const Align Alignment = getLoadStoreAlignment(I);
5789   InstructionCost Cost = 0;
5790   if (Legal->isMaskRequired(I)) {
5791     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
5792                                       CostKind);
5793   } else {
5794     TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
5795     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
5796                                 CostKind, OpInfo, I);
5797   }
5798 
5799   bool Reverse = ConsecutiveStride < 0;
5800   if (Reverse)
5801     Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, {},
5802                                CostKind, 0);
5803   return Cost;
5804 }
5805 
5806 InstructionCost
5807 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
5808                                                 ElementCount VF) {
5809   assert(Legal->isUniformMemOp(*I, VF));
5810 
5811   Type *ValTy = getLoadStoreType(I);
5812   auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5813   const Align Alignment = getLoadStoreAlignment(I);
5814   unsigned AS = getLoadStoreAddressSpace(I);
5815   if (isa<LoadInst>(I)) {
5816     return TTI.getAddressComputationCost(ValTy) +
5817            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
5818                                CostKind) +
5819            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy, {},
5820                               CostKind);
5821   }
5822   StoreInst *SI = cast<StoreInst>(I);
5823 
5824   bool IsLoopInvariantStoreValue = Legal->isInvariant(SI->getValueOperand());
5825   return TTI.getAddressComputationCost(ValTy) +
5826          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
5827                              CostKind) +
5828          (IsLoopInvariantStoreValue
5829               ? 0
5830               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
5831                                        CostKind, VF.getKnownMinValue() - 1));
5832 }
5833 
5834 InstructionCost
5835 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
5836                                                  ElementCount VF) {
5837   Type *ValTy = getLoadStoreType(I);
5838   auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5839   const Align Alignment = getLoadStoreAlignment(I);
5840   const Value *Ptr = getLoadStorePointerOperand(I);
5841 
5842   return TTI.getAddressComputationCost(VectorTy) +
5843          TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
5844                                     Legal->isMaskRequired(I), Alignment,
5845                                     CostKind, I);
5846 }
5847 
5848 InstructionCost
5849 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
5850                                                    ElementCount VF) {
5851   const auto *Group = getInterleavedAccessGroup(I);
5852   assert(Group && "Failed to get an interleaved access group.");
5853 
5854   Instruction *InsertPos = Group->getInsertPos();
5855   Type *ValTy = getLoadStoreType(InsertPos);
5856   auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5857   unsigned AS = getLoadStoreAddressSpace(InsertPos);
5858 
5859   unsigned InterleaveFactor = Group->getFactor();
5860   auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
5861 
5862   // Holds the indices of existing members in the interleaved group.
5863   SmallVector<unsigned, 4> Indices;
5864   for (unsigned IF = 0; IF < InterleaveFactor; IF++)
5865     if (Group->getMember(IF))
5866       Indices.push_back(IF);
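  // For example, a factor-4 load group with members only at indices 0 and 2
  // and VF = 4 queries the interleaved memory op cost for a 16-element wide
  // vector type with Indices = {0, 2}.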
5867 
5868   // Calculate the cost of the whole interleaved group.
5869   bool UseMaskForGaps =
5870       (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
5871       (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
5872   InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
5873       InsertPos->getOpcode(), WideVecTy, Group->getFactor(), Indices,
5874       Group->getAlign(), AS, CostKind, Legal->isMaskRequired(I),
5875       UseMaskForGaps);
5876 
5877   if (Group->isReverse()) {
5878     // TODO: Add support for reversed masked interleaved access.
5879     assert(!Legal->isMaskRequired(I) &&
5880            "Reverse masked interleaved access not supported.");
5881     Cost += Group->getNumMembers() *
5882             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, {},
5883                                CostKind, 0);
5884   }
5885   return Cost;
5886 }
5887 
5888 std::optional<InstructionCost>
5889 LoopVectorizationCostModel::getReductionPatternCost(Instruction *I,
5890                                                     ElementCount VF,
5891                                                     Type *Ty) const {
5892   using namespace llvm::PatternMatch;
5893   // Early exit for no inloop reductions
5894   if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Ty))
5895     return std::nullopt;
5896   auto *VectorTy = cast<VectorType>(Ty);
5897 
5898   // We look for one of the following patterns and find its minimal acceptable cost:
5899   //  reduce(mul(ext(A), ext(B))) or
5900   //  reduce(mul(A, B)) or
5901   //  reduce(ext(A)) or
5902   //  reduce(A).
5903   // The basic idea is that we walk down the tree, finding the root reduction
5904   // instruction in InLoopReductionImmediateChains. From there we find the
5905   // pattern of mul/ext and test the cost of the entire pattern vs the cost of
5906   // the components. If the reduction cost is lower, we return it for the
5907   // reduction instruction and 0 for the other instructions in the pattern. If
5908   // it is not, we return an invalid cost specifying that the original cost
5909   // method should be used.
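  // For example, for reduce.add(mul(zext(i8 %a), zext(i8 %b))) with an i32
  // recurrence type, the cost returned by getMulAccReductionCost is compared
  // against the summed costs of the two extends, the multiply and the plain
  // add reduction; the cheaper alternative wins.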
5910   Instruction *RetI = I;
5911   if (match(RetI, m_ZExtOrSExt(m_Value()))) {
5912     if (!RetI->hasOneUser())
5913       return std::nullopt;
5914     RetI = RetI->user_back();
5915   }
5916 
5917   if (match(RetI, m_OneUse(m_Mul(m_Value(), m_Value()))) &&
5918       RetI->user_back()->getOpcode() == Instruction::Add) {
5919     RetI = RetI->user_back();
5920   }
5921 
5922   // Test if the found instruction is a reduction, and if not return an invalid
5923   // cost specifying the parent to use the original cost modelling.
5924   if (!InLoopReductionImmediateChains.count(RetI))
5925     return std::nullopt;
5926 
5927   // Find the reduction this chain is a part of and calculate the basic cost of
5928   // the reduction on its own.
5929   Instruction *LastChain = InLoopReductionImmediateChains.at(RetI);
5930   Instruction *ReductionPhi = LastChain;
5931   while (!isa<PHINode>(ReductionPhi))
5932     ReductionPhi = InLoopReductionImmediateChains.at(ReductionPhi);
5933 
5934   const RecurrenceDescriptor &RdxDesc =
5935       Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second;
5936 
5937   InstructionCost BaseCost;
5938   RecurKind RK = RdxDesc.getRecurrenceKind();
5939   if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) {
5940     Intrinsic::ID MinMaxID = getMinMaxReductionIntrinsicOp(RK);
5941     BaseCost = TTI.getMinMaxReductionCost(MinMaxID, VectorTy,
5942                                           RdxDesc.getFastMathFlags(), CostKind);
5943   } else {
5944     BaseCost = TTI.getArithmeticReductionCost(
5945         RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
5946   }
5947 
5948   // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
5949   // normal fmul instruction to the cost of the fadd reduction.
5950   if (RK == RecurKind::FMulAdd)
5951     BaseCost +=
5952         TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);
5953 
5954   // If we're using ordered reductions then we can just return the base cost
5955   // here, since getArithmeticReductionCost calculates the full ordered
5956   // reduction cost when FP reassociation is not allowed.
5957   if (useOrderedReductions(RdxDesc))
5958     return BaseCost;
5959 
5960   // Get the operand that was not the reduction chain and match it to one of the
5961   // patterns, returning the better cost if it is found.
5962   Instruction *RedOp = RetI->getOperand(1) == LastChain
5963                            ? dyn_cast<Instruction>(RetI->getOperand(0))
5964                            : dyn_cast<Instruction>(RetI->getOperand(1));
5965 
5966   VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
5967 
5968   Instruction *Op0, *Op1;
5969   if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
5970       match(RedOp,
5971             m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
5972       match(Op0, m_ZExtOrSExt(m_Value())) &&
5973       Op0->getOpcode() == Op1->getOpcode() &&
5974       Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
5975       !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) &&
5976       (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
5977 
5978     // Matched reduce.add(ext(mul(ext(A), ext(B))))
5979     // Note that the extend opcodes need to all match, or if A==B they will have
5980     // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
5981     // which is equally fine.
5982     bool IsUnsigned = isa<ZExtInst>(Op0);
5983     auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
5984     auto *MulType = VectorType::get(Op0->getType(), VectorTy);
5985 
5986     InstructionCost ExtCost =
5987         TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
5988                              TTI::CastContextHint::None, CostKind, Op0);
5989     InstructionCost MulCost =
5990         TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
5991     InstructionCost Ext2Cost =
5992         TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType,
5993                              TTI::CastContextHint::None, CostKind, RedOp);
5994 
5995     InstructionCost RedCost = TTI.getMulAccReductionCost(
5996         IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
5997 
5998     if (RedCost.isValid() &&
5999         RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
6000       return I == RetI ? RedCost : 0;
6001   } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
6002              !TheLoop->isLoopInvariant(RedOp)) {
6003     // Matched reduce(ext(A))
6004     bool IsUnsigned = isa<ZExtInst>(RedOp);
6005     auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
6006     InstructionCost RedCost = TTI.getExtendedReductionCost(
6007         RdxDesc.getOpcode(), IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
6008         RdxDesc.getFastMathFlags(), CostKind);
6009 
6010     InstructionCost ExtCost =
6011         TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
6012                              TTI::CastContextHint::None, CostKind, RedOp);
6013     if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
6014       return I == RetI ? RedCost : 0;
6015   } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
6016              match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
6017     if (match(Op0, m_ZExtOrSExt(m_Value())) &&
6018         Op0->getOpcode() == Op1->getOpcode() &&
6019         !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) {
6020       bool IsUnsigned = isa<ZExtInst>(Op0);
6021       Type *Op0Ty = Op0->getOperand(0)->getType();
6022       Type *Op1Ty = Op1->getOperand(0)->getType();
6023       Type *LargestOpTy =
6024           Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
6025                                                                     : Op0Ty;
6026       auto *ExtType = VectorType::get(LargestOpTy, VectorTy);
6027 
6028       // Matched reduce.add(mul(ext(A), ext(B))), where the two exts may be of
6029       // different sizes. We take the largest type as the ext to reduce, and add
6030       // the cost of the remaining extend as, e.g., in reduce(mul(ext(ext(A)), ext(B))).
6031       InstructionCost ExtCost0 = TTI.getCastInstrCost(
6032           Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy),
6033           TTI::CastContextHint::None, CostKind, Op0);
6034       InstructionCost ExtCost1 = TTI.getCastInstrCost(
6035           Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy),
6036           TTI::CastContextHint::None, CostKind, Op1);
6037       InstructionCost MulCost =
6038           TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6039 
6040       InstructionCost RedCost = TTI.getMulAccReductionCost(
6041           IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
6042       InstructionCost ExtraExtCost = 0;
6043       if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
6044         Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
6045         ExtraExtCost = TTI.getCastInstrCost(
6046             ExtraExtOp->getOpcode(), ExtType,
6047             VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy),
6048             TTI::CastContextHint::None, CostKind, ExtraExtOp);
6049       }
6050 
6051       if (RedCost.isValid() &&
6052           (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
6053         return I == RetI ? RedCost : 0;
6054     } else if (!match(I, m_ZExtOrSExt(m_Value()))) {
6055       // Matched reduce.add(mul())
6056       InstructionCost MulCost =
6057           TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6058 
6059       InstructionCost RedCost = TTI.getMulAccReductionCost(
6060           true, RdxDesc.getRecurrenceType(), VectorTy, CostKind);
6061 
6062       if (RedCost.isValid() && RedCost < MulCost + BaseCost)
6063         return I == RetI ? RedCost : 0;
6064     }
6065   }
6066 
6067   return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt;
6068 }
6069 
6070 InstructionCost
6071 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
6072                                                      ElementCount VF) {
6073   // Calculate scalar cost only. Vectorization cost should be ready at this
6074   // moment.
6075   if (VF.isScalar()) {
6076     Type *ValTy = getLoadStoreType(I);
6077     const Align Alignment = getLoadStoreAlignment(I);
6078     unsigned AS = getLoadStoreAddressSpace(I);
6079 
6080     TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
6081     return TTI.getAddressComputationCost(ValTy) +
6082            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, CostKind,
6083                                OpInfo, I);
6084   }
6085   return getWideningCost(I, VF);
6086 }
6087 
6088 InstructionCost
6089 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
6090                                                      ElementCount VF) const {
6091 
6092   // There is no mechanism yet to create a scalable scalarization loop,
6093   // so this is currently Invalid.
6094   if (VF.isScalable())
6095     return InstructionCost::getInvalid();
6096 
6097   if (VF.isScalar())
6098     return 0;
6099 
6100   InstructionCost Cost = 0;
6101   Type *RetTy = toVectorTy(I->getType(), VF);
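  // Account for inserting the scalar results back into a vector, unless this is
  // a load on a target that supports efficient vector element loads/stores.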
6102   if (!RetTy->isVoidTy() &&
6103       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
6104     Cost += TTI.getScalarizationOverhead(
6105         cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()),
6106         /*Insert*/ true,
6107         /*Extract*/ false, CostKind);
6108 
6109   // Some targets keep addresses scalar.
6110   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
6111     return Cost;
6112 
6113   // Some targets support efficient element stores.
6114   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
6115     return Cost;
6116 
6117   // Collect operands to consider.
6118   CallInst *CI = dyn_cast<CallInst>(I);
6119   Instruction::op_range Ops = CI ? CI->args() : I->operands();
6120 
6121   // Skip operands that do not require extraction/scalarization and do not incur
6122   // any overhead.
6123   SmallVector<Type *> Tys;
6124   for (auto *V : filterExtractingOperands(Ops, VF))
6125     Tys.push_back(maybeVectorizeType(V->getType(), VF));
6126   return Cost + TTI.getOperandsScalarizationOverhead(
6127                     filterExtractingOperands(Ops, VF), Tys, CostKind);
6128 }
6129 
6130 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
6131   if (VF.isScalar())
6132     return;
6133   NumPredStores = 0;
6134   for (BasicBlock *BB : TheLoop->blocks()) {
6135     // For each instruction in the old loop.
6136     for (Instruction &I : *BB) {
6137       Value *Ptr =  getLoadStorePointerOperand(&I);
6138       if (!Ptr)
6139         continue;
6140 
6141       // TODO: We should generate better code and update the cost model for
6142       // predicated uniform stores. Today they are treated as any other
6143       // predicated store (see added test cases in
6144       // invariant-store-vectorization.ll).
6145       if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF))
6146         NumPredStores++;
6147 
6148       if (Legal->isUniformMemOp(I, VF)) {
6149         auto IsLegalToScalarize = [&]() {
6150           if (!VF.isScalable())
6151             // Scalarization of fixed length vectors "just works".
6152             return true;
6153 
6154           // We have dedicated lowering for unpredicated uniform loads and
6155           // stores.  Note that even with tail folding we know that at least
6156           // one lane is active (i.e. generalized predication is not possible
6157           // here), and the logic below depends on this fact.
6158           if (!foldTailByMasking())
6159             return true;
6160 
6161           // For scalable vectors, a uniform memop load is always
6162           // uniform-by-parts  and we know how to scalarize that.
6163           if (isa<LoadInst>(I))
6164             return true;
6165 
6166           // A uniform store isn't necessarily uniform-by-parts
6167           // and we can't assume scalarization.
6168           auto &SI = cast<StoreInst>(I);
6169           return TheLoop->isLoopInvariant(SI.getValueOperand());
6170         };
6171 
6172         const InstructionCost GatherScatterCost =
6173           isLegalGatherOrScatter(&I, VF) ?
6174           getGatherScatterCost(&I, VF) : InstructionCost::getInvalid();
6175 
6176         // Load: Scalar load + broadcast
6177         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6178         // FIXME: This cost is a significant under-estimate for tail folded
6179         // memory ops.
6180         const InstructionCost ScalarizationCost =
6181             IsLegalToScalarize() ? getUniformMemOpCost(&I, VF)
6182                                  : InstructionCost::getInvalid();
6183 
6184         // Choose the better solution for the current VF. Note that Invalid
6185         // costs compare as maximally large. If both are invalid, we get an
6186         // invalid cost, which signals a failure and a vectorization abort.
6187         if (GatherScatterCost < ScalarizationCost)
6188           setWideningDecision(&I, VF, CM_GatherScatter, GatherScatterCost);
6189         else
6190           setWideningDecision(&I, VF, CM_Scalarize, ScalarizationCost);
6191         continue;
6192       }
6193 
6194       // We assume that widening is the best solution when possible.
6195       if (memoryInstructionCanBeWidened(&I, VF)) {
6196         InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
6197         int ConsecutiveStride = Legal->isConsecutivePtr(
6198             getLoadStoreType(&I), getLoadStorePointerOperand(&I));
6199         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6200                "Expected consecutive stride.");
6201         InstWidening Decision =
6202             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6203         setWideningDecision(&I, VF, Decision, Cost);
6204         continue;
6205       }
6206 
6207       // Choose between Interleaving, Gather/Scatter or Scalarization.
6208       InstructionCost InterleaveCost = InstructionCost::getInvalid();
6209       unsigned NumAccesses = 1;
6210       if (isAccessInterleaved(&I)) {
6211         const auto *Group = getInterleavedAccessGroup(&I);
6212         assert(Group && "Fail to get an interleaved access group.");
6213 
6214         // Make one decision for the whole group.
6215         if (getWideningDecision(&I, VF) != CM_Unknown)
6216           continue;
6217 
6218         NumAccesses = Group->getNumMembers();
6219         if (interleavedAccessCanBeWidened(&I, VF))
6220           InterleaveCost = getInterleaveGroupCost(&I, VF);
6221       }
6222 
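      // Both remaining alternatives are costed per access, so scale them by the
      // number of interleave-group members (1 for a non-interleaved access).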
6223       InstructionCost GatherScatterCost =
6224           isLegalGatherOrScatter(&I, VF)
6225               ? getGatherScatterCost(&I, VF) * NumAccesses
6226               : InstructionCost::getInvalid();
6227 
6228       InstructionCost ScalarizationCost =
6229           getMemInstScalarizationCost(&I, VF) * NumAccesses;
6230 
6231       // Choose the better solution for the current VF, record the decision
6232       // and use it during vectorization.
6233       InstructionCost Cost;
6234       InstWidening Decision;
6235       if (InterleaveCost <= GatherScatterCost &&
6236           InterleaveCost < ScalarizationCost) {
6237         Decision = CM_Interleave;
6238         Cost = InterleaveCost;
6239       } else if (GatherScatterCost < ScalarizationCost) {
6240         Decision = CM_GatherScatter;
6241         Cost = GatherScatterCost;
6242       } else {
6243         Decision = CM_Scalarize;
6244         Cost = ScalarizationCost;
6245       }
6246       // If the instruction belongs to an interleave group, the whole group
6247       // receives the same decision. The cost applies to the whole group, but
6248       // it will actually be assigned to a single instruction.
6249       if (const auto *Group = getInterleavedAccessGroup(&I))
6250         setWideningDecision(Group, VF, Decision, Cost);
6251       else
6252         setWideningDecision(&I, VF, Decision, Cost);
6253     }
6254   }
6255 
6256   // Make sure that any load of an address and any other address computation
6257   // remains scalar unless there is gather/scatter support. This avoids
6258   // inevitable extracts into address registers, and also has the benefit of
6259   // activating LSR more, since that pass can't optimize vectorized
6260   // addresses.
6261   if (TTI.prefersVectorizedAddressing())
6262     return;
6263 
6264   // Start with all scalar pointer uses.
6265   SmallPtrSet<Instruction *, 8> AddrDefs;
6266   for (BasicBlock *BB : TheLoop->blocks())
6267     for (Instruction &I : *BB) {
6268       Instruction *PtrDef =
6269         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6270       if (PtrDef && TheLoop->contains(PtrDef) &&
6271           getWideningDecision(&I, VF) != CM_GatherScatter)
6272         AddrDefs.insert(PtrDef);
6273     }
6274 
6275   // Add all instructions used to generate the addresses.
6276   SmallVector<Instruction *, 4> Worklist;
6277   append_range(Worklist, AddrDefs);
6278   while (!Worklist.empty()) {
6279     Instruction *I = Worklist.pop_back_val();
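    // Only follow non-PHI operands defined in the same block that have not been
    // visited yet.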
6280     for (auto &Op : I->operands())
6281       if (auto *InstOp = dyn_cast<Instruction>(Op))
6282         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6283             AddrDefs.insert(InstOp).second)
6284           Worklist.push_back(InstOp);
6285   }
6286 
6287   for (auto *I : AddrDefs) {
6288     if (isa<LoadInst>(I)) {
6289       // Setting the desired widening decision should ideally be handled by
6290       // the cost functions, but since this involves finding out whether the
6291       // loaded register is involved in an address computation, it is instead
6292       // changed here when we know this is the case.
6293       InstWidening Decision = getWideningDecision(I, VF);
6294       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6295         // Scalarize a widened load of an address.
6296         setWideningDecision(
6297             I, VF, CM_Scalarize,
6298             (VF.getKnownMinValue() *
6299              getMemoryInstructionCost(I, ElementCount::getFixed(1))));
6300       else if (const auto *Group = getInterleavedAccessGroup(I)) {
6301         // Scalarize an interleave group of address loads.
6302         for (unsigned I = 0; I < Group->getFactor(); ++I) {
6303           if (Instruction *Member = Group->getMember(I))
6304             setWideningDecision(
6305                 Member, VF, CM_Scalarize,
6306                 (VF.getKnownMinValue() *
6307                  getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
6308         }
6309       }
6310     } else
6311       // Make sure I gets scalarized and gets a cost estimate without
6312       // scalarization overhead.
6313       ForcedScalars[VF].insert(I);
6314   }
6315 }
6316 
6317 void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
6318   assert(!VF.isScalar() &&
6319          "Trying to set a vectorization decision for a scalar VF");
6320 
6321   auto ForcedScalar = ForcedScalars.find(VF);
6322   for (BasicBlock *BB : TheLoop->blocks()) {
6323     // For each instruction in the old loop.
6324     for (Instruction &I : *BB) {
6325       CallInst *CI = dyn_cast<CallInst>(&I);
6326 
6327       if (!CI)
6328         continue;
6329 
6330       InstructionCost ScalarCost = InstructionCost::getInvalid();
6331       InstructionCost VectorCost = InstructionCost::getInvalid();
6332       InstructionCost IntrinsicCost = InstructionCost::getInvalid();
6333       Function *ScalarFunc = CI->getCalledFunction();
6334       Type *ScalarRetTy = CI->getType();
6335       SmallVector<Type *, 4> Tys, ScalarTys;
6336       for (auto &ArgOp : CI->args())
6337         ScalarTys.push_back(ArgOp->getType());
6338 
6339       // Estimate cost of scalarized vector call. The source operands are
6340       // assumed to be vectors, so we need to extract individual elements from
6341       // them, execute VF scalar calls, and then gather the results into the
6342       // vector return value.
6343       InstructionCost ScalarCallCost =
6344           TTI.getCallInstrCost(ScalarFunc, ScalarRetTy, ScalarTys, CostKind);
6345 
6346       // Compute costs of unpacking argument values for the scalar calls and
6347       // packing the return values to a vector.
6348       InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);
6349 
6350       ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
6351       // Honor ForcedScalars and UniformAfterVectorization decisions.
6352       // TODO: For calls, it might still be more profitable to widen. Use
6353       // VPlan-based cost model to compare different options.
6354       if (VF.isVector() && ((ForcedScalar != ForcedScalars.end() &&
6355                              ForcedScalar->second.contains(CI)) ||
6356                             isUniformAfterVectorization(CI, VF))) {
6357         setCallWideningDecision(CI, VF, CM_Scalarize, nullptr,
6358                                 Intrinsic::not_intrinsic, std::nullopt,
6359                                 ScalarCost);
6360         continue;
6361       }
6362 
6363       bool MaskRequired = Legal->isMaskRequired(CI);
6364       // Compute corresponding vector type for return value and arguments.
6365       Type *RetTy = toVectorTy(ScalarRetTy, VF);
6366       for (Type *ScalarTy : ScalarTys)
6367         Tys.push_back(toVectorTy(ScalarTy, VF));
6368 
6369       // An in-loop reduction using an fmuladd intrinsic is a special case;
6370       // we don't want the normal cost for that intrinsic.
6371       if (RecurrenceDescriptor::isFMulAddIntrinsic(CI))
6372         if (auto RedCost = getReductionPatternCost(CI, VF, RetTy)) {
6373           setCallWideningDecision(CI, VF, CM_IntrinsicCall, nullptr,
6374                                   getVectorIntrinsicIDForCall(CI, TLI),
6375                                   std::nullopt, *RedCost);
6376           continue;
6377         }
6378 
6379       // Find the cost of vectorizing the call, if we can find a suitable
6380       // vector variant of the function.
6381       bool UsesMask = false;
6382       VFInfo FuncInfo;
6383       Function *VecFunc = nullptr;
6384       // Search through any available variants for one we can use at this VF.
6385       for (VFInfo &Info : VFDatabase::getMappings(*CI)) {
6386         // Must match requested VF.
6387         if (Info.Shape.VF != VF)
6388           continue;
6389 
6390         // Must take a mask argument if one is required
6391         if (MaskRequired && !Info.isMasked())
6392           continue;
6393 
6394         // Check that all parameter kinds are supported
6395         bool ParamsOk = true;
6396         for (VFParameter Param : Info.Shape.Parameters) {
6397           switch (Param.ParamKind) {
6398           case VFParamKind::Vector:
6399             break;
6400           case VFParamKind::OMP_Uniform: {
6401             Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
6402             // Make sure the scalar parameter in the loop is invariant.
6403             if (!PSE.getSE()->isLoopInvariant(PSE.getSCEV(ScalarParam),
6404                                               TheLoop))
6405               ParamsOk = false;
6406             break;
6407           }
6408           case VFParamKind::OMP_Linear: {
6409             Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
6410             // Find the stride for the scalar parameter in this loop and see if
6411             // it matches the stride for the variant.
6412             // TODO: do we need to figure out the cost of an extract to get the
6413             // first lane? Or do we hope that it will be folded away?
6414             ScalarEvolution *SE = PSE.getSE();
6415             const auto *SAR =
6416                 dyn_cast<SCEVAddRecExpr>(SE->getSCEV(ScalarParam));
6417 
6418             if (!SAR || SAR->getLoop() != TheLoop) {
6419               ParamsOk = false;
6420               break;
6421             }
6422 
6423             const SCEVConstant *Step =
6424                 dyn_cast<SCEVConstant>(SAR->getStepRecurrence(*SE));
6425 
6426             if (!Step ||
6427                 Step->getAPInt().getSExtValue() != Param.LinearStepOrPos)
6428               ParamsOk = false;
6429 
6430             break;
6431           }
6432           case VFParamKind::GlobalPredicate:
6433             UsesMask = true;
6434             break;
6435           default:
6436             ParamsOk = false;
6437             break;
6438           }
6439         }
6440 
6441         if (!ParamsOk)
6442           continue;
6443 
6444         // Found a suitable candidate, stop here.
6445         VecFunc = CI->getModule()->getFunction(Info.VectorName);
6446         FuncInfo = Info;
6447         break;
6448       }
6449 
6450       // Add in the cost of synthesizing a mask if one wasn't required.
6451       InstructionCost MaskCost = 0;
6452       if (VecFunc && UsesMask && !MaskRequired)
6453         MaskCost = TTI.getShuffleCost(
6454             TargetTransformInfo::SK_Broadcast,
6455             VectorType::get(IntegerType::getInt1Ty(
6456                                 VecFunc->getFunctionType()->getContext()),
6457                             VF),
6458             {}, CostKind);
6459 
6460       if (TLI && VecFunc && !CI->isNoBuiltin())
6461         VectorCost =
6462             TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind) + MaskCost;
6463 
6464       // Find the cost of an intrinsic; some targets may have instructions that
6465       // perform the operation without needing an actual call.
6466       Intrinsic::ID IID = getVectorIntrinsicIDForCall(CI, TLI);
6467       if (IID != Intrinsic::not_intrinsic)
6468         IntrinsicCost = getVectorIntrinsicCost(CI, VF);
6469 
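      // Pick the cheapest of scalarizing the call, calling a vector library
      // function, and using a vector intrinsic; later options win on ties.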
6470       InstructionCost Cost = ScalarCost;
6471       InstWidening Decision = CM_Scalarize;
6472 
6473       if (VectorCost <= Cost) {
6474         Cost = VectorCost;
6475         Decision = CM_VectorCall;
6476       }
6477 
6478       if (IntrinsicCost <= Cost) {
6479         Cost = IntrinsicCost;
6480         Decision = CM_IntrinsicCall;
6481       }
6482 
6483       setCallWideningDecision(CI, VF, Decision, VecFunc, IID,
6484                               FuncInfo.getParamIndexForOptionalMask(), Cost);
6485     }
6486   }
6487 }
6488 
6489 bool LoopVectorizationCostModel::shouldConsiderInvariant(Value *Op) {
6490   if (!Legal->isInvariant(Op))
6491     return false;
6492   // Consider Op invariant if neither it nor its operands are predicated
6493   // instructions in the loop; a predicated one is not trivially hoistable.
6494   auto *OpI = dyn_cast<Instruction>(Op);
6495   return !OpI || !TheLoop->contains(OpI) ||
6496          (!isPredicatedInst(OpI) &&
6497           (!isa<PHINode>(OpI) || OpI->getParent() != TheLoop->getHeader()) &&
6498           all_of(OpI->operands(),
6499                  [this](Value *Op) { return shouldConsiderInvariant(Op); }));
6500 }
6501 
6502 InstructionCost
6503 LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6504                                                ElementCount VF) {
6505   // If we know that this instruction will remain uniform, check the cost of
6506   // the scalar version.
6507   if (isUniformAfterVectorization(I, VF))
6508     VF = ElementCount::getFixed(1);
6509 
6510   if (VF.isVector() && isProfitableToScalarize(I, VF))
6511     return InstsToScalarize[VF][I];
6512 
6513   // Forced scalars do not have any scalarization overhead.
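  // Their cost is the scalar instruction cost replicated across all VF lanes.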
6514   auto ForcedScalar = ForcedScalars.find(VF);
6515   if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
6516     auto InstSet = ForcedScalar->second;
6517     if (InstSet.count(I))
6518       return getInstructionCost(I, ElementCount::getFixed(1)) *
6519              VF.getKnownMinValue();
6520   }
6521 
6522   Type *RetTy = I->getType();
6523   if (canTruncateToMinimalBitwidth(I, VF))
6524     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6525   auto *SE = PSE.getSE();
6526 
6527   auto HasSingleCopyAfterVectorization = [this](Instruction *I,
6528                                                 ElementCount VF) -> bool {
6529     if (VF.isScalar())
6530       return true;
6531 
6532     auto Scalarized = InstsToScalarize.find(VF);
6533     assert(Scalarized != InstsToScalarize.end() &&
6534            "VF not yet analyzed for scalarization profitability");
6535     return !Scalarized->second.count(I) &&
6536            llvm::all_of(I->users(), [&](User *U) {
6537              auto *UI = cast<Instruction>(U);
6538              return !Scalarized->second.count(UI);
6539            });
6540   };
6541   (void)HasSingleCopyAfterVectorization;
6542 
6543   Type *VectorTy;
6544   if (isScalarAfterVectorization(I, VF)) {
6545     // With the exception of GEPs and PHIs, after scalarization there should
6546     // only be one copy of the instruction generated in the loop. This is
6547     // because the VF is either 1, or any instructions that need scalarizing
6548     // have already been dealt with by the time we get here. As a result,
6549     // we don't have to multiply the instruction cost by VF.
6550     assert(I->getOpcode() == Instruction::GetElementPtr ||
6551            I->getOpcode() == Instruction::PHI ||
6552            (I->getOpcode() == Instruction::BitCast &&
6553             I->getType()->isPointerTy()) ||
6554            HasSingleCopyAfterVectorization(I, VF));
6555     VectorTy = RetTy;
6556   } else
6557     VectorTy = toVectorTy(RetTy, VF);
6558 
6559   if (VF.isVector() && VectorTy->isVectorTy() &&
6560       !TTI.getNumberOfParts(VectorTy))
6561     return InstructionCost::getInvalid();
6562 
6563   // TODO: We need to estimate the cost of intrinsic calls.
6564   switch (I->getOpcode()) {
6565   case Instruction::GetElementPtr:
6566     // We mark this instruction as zero-cost because the cost of GEPs in
6567     // vectorized code depends on whether the corresponding memory instruction
6568     // is scalarized or not. Therefore, we handle GEPs with the memory
6569     // instruction cost.
6570     return 0;
6571   case Instruction::Br: {
6572     // In cases of scalarized and predicated instructions, there will be VF
6573     // predicated blocks in the vectorized loop. Each branch around these
6574     // blocks also requires an extract of its vector compare i1 element.
6575     // Note that the conditional branch from the loop latch will be replaced by
6576     // a single branch controlling the loop, so there is no extra overhead from
6577     // scalarization.
6578     bool ScalarPredicatedBB = false;
6579     BranchInst *BI = cast<BranchInst>(I);
6580     if (VF.isVector() && BI->isConditional() &&
6581         (PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) ||
6582          PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))) &&
6583         BI->getParent() != TheLoop->getLoopLatch())
6584       ScalarPredicatedBB = true;
6585 
6586     if (ScalarPredicatedBB) {
6587       // Not possible to scalarize a scalable vector with predicated instructions.
6588       if (VF.isScalable())
6589         return InstructionCost::getInvalid();
6590       // Return cost for branches around scalarized and predicated blocks.
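      // Cost: per lane, one extract from the i1 condition vector plus one branch.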
6591       auto *VecI1Ty =
6592           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6593       return (
6594           TTI.getScalarizationOverhead(
6595               VecI1Ty, APInt::getAllOnes(VF.getFixedValue()),
6596               /*Insert*/ false, /*Extract*/ true, CostKind) +
6597           (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
6598     }
6599 
6600     if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
6601       // The back-edge branch will remain, as will all scalar branches.
6602       return TTI.getCFInstrCost(Instruction::Br, CostKind);
6603 
6604     // This branch will be eliminated by if-conversion.
6605     return 0;
6606     // Note: We currently assume zero cost for an unconditional branch inside
6607     // a predicated block since it will become a fall-through, although we
6608     // may decide in the future to call TTI for all branches.
6609   }
6610   case Instruction::Switch: {
6611     if (VF.isScalar())
6612       return TTI.getCFInstrCost(Instruction::Switch, CostKind);
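    // Model a vectorized switch as one vector integer compare per case; the
    // default destination does not need a compare.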
6613     auto *Switch = cast<SwitchInst>(I);
6614     return Switch->getNumCases() *
6615            TTI.getCmpSelInstrCost(
6616                Instruction::ICmp,
6617                toVectorTy(Switch->getCondition()->getType(), VF),
6618                toVectorTy(Type::getInt1Ty(I->getContext()), VF),
6619                CmpInst::ICMP_EQ, CostKind);
6620   }
6621   case Instruction::PHI: {
6622     auto *Phi = cast<PHINode>(I);
6623 
6624     // First-order recurrences are replaced by vector shuffles inside the loop.
6625     if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) {
6626       // For <vscale x 1 x i64>, if vscale = 1 we are unable to extract the
6627       // penultimate value of the recurrence.
6628       // TODO: Consider vscale_range info.
6629       if (VF.isScalable() && VF.getKnownMinValue() == 1)
6630         return InstructionCost::getInvalid();
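      // Model the recurrence as a splice of the previous and current vectors:
      // the last element of the previous vector followed by the first VF - 1
      // elements of the current one.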
6631       SmallVector<int> Mask(VF.getKnownMinValue());
6632       std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
6633       return TTI.getShuffleCost(TargetTransformInfo::SK_Splice,
6634                                 cast<VectorType>(VectorTy), Mask, CostKind,
6635                                 VF.getKnownMinValue() - 1);
6636     }
6637 
6638     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6639     // converted into select instructions. We require N - 1 selects per phi
6640     // node, where N is the number of incoming values.
6641     if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) {
6642       Type *ResultTy = Phi->getType();
6643 
6644       // All instructions in an Any-of reduction chain are narrowed to bool.
6645       // Check if that is the case for this phi node.
6646       auto *HeaderUser = cast_if_present<PHINode>(
6647           find_singleton<User>(Phi->users(), [this](User *U, bool) -> User * {
6648             auto *Phi = dyn_cast<PHINode>(U);
6649             if (Phi && Phi->getParent() == TheLoop->getHeader())
6650               return Phi;
6651             return nullptr;
6652           }));
6653       if (HeaderUser) {
6654         auto &ReductionVars = Legal->getReductionVars();
6655         auto Iter = ReductionVars.find(HeaderUser);
6656         if (Iter != ReductionVars.end() &&
6657             RecurrenceDescriptor::isAnyOfRecurrenceKind(
6658                 Iter->second.getRecurrenceKind()))
6659           ResultTy = Type::getInt1Ty(Phi->getContext());
6660       }
6661       return (Phi->getNumIncomingValues() - 1) *
6662              TTI.getCmpSelInstrCost(
6663                  Instruction::Select, toVectorTy(ResultTy, VF),
6664                  toVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
6665                  CmpInst::BAD_ICMP_PREDICATE, CostKind);
6666     }
6667 
6668     // When tail folding with EVL, if the phi is part of an out-of-loop
6669     // reduction then it will be transformed into a wide vp_merge.
6670     if (VF.isVector() && foldTailWithEVL() &&
6671         Legal->getReductionVars().contains(Phi) && !isInLoopReduction(Phi)) {
6672       IntrinsicCostAttributes ICA(
6673           Intrinsic::vp_merge, toVectorTy(Phi->getType(), VF),
6674           {toVectorTy(Type::getInt1Ty(Phi->getContext()), VF)});
6675       return TTI.getIntrinsicInstrCost(ICA, CostKind);
6676     }
6677 
6678     return TTI.getCFInstrCost(Instruction::PHI, CostKind);
6679   }
6680   case Instruction::UDiv:
6681   case Instruction::SDiv:
6682   case Instruction::URem:
6683   case Instruction::SRem:
6684     if (VF.isVector() && isPredicatedInst(I)) {
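      // For a predicated div/rem, pick the cheaper of scalarizing with
      // predication and selecting a safe divisor.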
6685       const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
6686       return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ?
6687         ScalarCost : SafeDivisorCost;
6688     }
6689     // We've proven all lanes safe to speculate, fall through.
6690     [[fallthrough]];
6691   case Instruction::Add:
6692   case Instruction::Sub: {
6693     auto Info = Legal->getHistogramInfo(I);
6694     if (Info && VF.isVector()) {
6695       const HistogramInfo *HGram = Info.value();
6696       // Assume that a non-constant update value (or a constant != 1) requires
6697       // a multiply, and add that into the cost.
6698       InstructionCost MulCost = TTI::TCC_Free;
6699       ConstantInt *RHS = dyn_cast<ConstantInt>(I->getOperand(1));
6700       if (!RHS || RHS->getZExtValue() != 1)
6701         MulCost =
6702             TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6703 
6704       // Find the cost of the histogram operation itself.
6705       Type *PtrTy = VectorType::get(HGram->Load->getPointerOperandType(), VF);
6706       Type *ScalarTy = I->getType();
6707       Type *MaskTy = VectorType::get(Type::getInt1Ty(I->getContext()), VF);
6708       IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add,
6709                                   Type::getVoidTy(I->getContext()),
6710                                   {PtrTy, ScalarTy, MaskTy});
6711 
6712       // Add the costs together with the add/sub operation.
6713       return TTI.getIntrinsicInstrCost(ICA, CostKind) + MulCost +
6714              TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, CostKind);
6715     }
6716     [[fallthrough]];
6717   }
6718   case Instruction::FAdd:
6719   case Instruction::FSub:
6720   case Instruction::Mul:
6721   case Instruction::FMul:
6722   case Instruction::FDiv:
6723   case Instruction::FRem:
6724   case Instruction::Shl:
6725   case Instruction::LShr:
6726   case Instruction::AShr:
6727   case Instruction::And:
6728   case Instruction::Or:
6729   case Instruction::Xor: {
6730     // If we're speculating on the stride being 1, the multiplication may
6731     // fold away.  We can generalize this for all operations using the notion
6732     // of neutral elements.  (TODO)
6733     if (I->getOpcode() == Instruction::Mul &&
6734         (PSE.getSCEV(I->getOperand(0))->isOne() ||
6735          PSE.getSCEV(I->getOperand(1))->isOne()))
6736       return 0;
6737 
6738     // Detect reduction patterns
6739     if (auto RedCost = getReductionPatternCost(I, VF, VectorTy))
6740       return *RedCost;
6741 
6742     // Certain instructions can be cheaper to vectorize if they have a constant
6743     // second vector operand. One example of this is shifts on x86.
6744     Value *Op2 = I->getOperand(1);
6745     if (!isa<Constant>(Op2) && PSE.getSE()->isSCEVable(Op2->getType()) &&
6746         isa<SCEVConstant>(PSE.getSCEV(Op2))) {
6747       Op2 = cast<SCEVConstant>(PSE.getSCEV(Op2))->getValue();
6748     }
6749     auto Op2Info = TTI.getOperandInfo(Op2);
6750     if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
6751         shouldConsiderInvariant(Op2))
6752       Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
6753 
6754     SmallVector<const Value *, 4> Operands(I->operand_values());
6755     return TTI.getArithmeticInstrCost(
6756         I->getOpcode(), VectorTy, CostKind,
6757         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6758         Op2Info, Operands, I, TLI);
6759   }
6760   case Instruction::FNeg: {
6761     return TTI.getArithmeticInstrCost(
6762         I->getOpcode(), VectorTy, CostKind,
6763         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6764         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6765         I->getOperand(0), I);
6766   }
6767   case Instruction::Select: {
6768     SelectInst *SI = cast<SelectInst>(I);
6769     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6770     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6771 
6772     const Value *Op0, *Op1;
6773     using namespace llvm::PatternMatch;
6774     if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
6775                         match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
6776       // select x, y, false --> x & y
6777       // select x, true, y --> x | y
6778       const auto [Op1VK, Op1VP] = TTI::getOperandInfo(Op0);
6779       const auto [Op2VK, Op2VP] = TTI::getOperandInfo(Op1);
6780       assert(Op0->getType()->getScalarSizeInBits() == 1 &&
6781               Op1->getType()->getScalarSizeInBits() == 1);
6782 
6783       SmallVector<const Value *, 2> Operands{Op0, Op1};
6784       return TTI.getArithmeticInstrCost(
6785           match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy,
6786           CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, I);
6787     }
6788 
6789     Type *CondTy = SI->getCondition()->getType();
6790     if (!ScalarCond)
6791       CondTy = VectorType::get(CondTy, VF);
6792 
6793     CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
6794     if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
6795       Pred = Cmp->getPredicate();
6796     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred,
6797                                   CostKind, {TTI::OK_AnyValue, TTI::OP_None},
6798                                   {TTI::OK_AnyValue, TTI::OP_None}, I);
6799   }
6800   case Instruction::ICmp:
6801   case Instruction::FCmp: {
6802     Type *ValTy = I->getOperand(0)->getType();
6803 
6804     if (canTruncateToMinimalBitwidth(I, VF)) {
6805       Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6806       (void)Op0AsInstruction;
6807       assert((!canTruncateToMinimalBitwidth(Op0AsInstruction, VF) ||
6808               MinBWs[I] == MinBWs[Op0AsInstruction]) &&
6809              "if both the operand and the compare are marked for "
6810              "truncation, they must have the same bitwidth");
6811       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[I]);
6812     }
6813 
6814     VectorTy = toVectorTy(ValTy, VF);
6815     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
6816                                   cast<CmpInst>(I)->getPredicate(), CostKind,
6817                                   {TTI::OK_AnyValue, TTI::OP_None},
6818                                   {TTI::OK_AnyValue, TTI::OP_None}, I);
6819   }
6820   case Instruction::Store:
6821   case Instruction::Load: {
6822     ElementCount Width = VF;
6823     if (Width.isVector()) {
6824       InstWidening Decision = getWideningDecision(I, Width);
6825       assert(Decision != CM_Unknown &&
6826              "CM decision should be taken at this point");
6827       if (getWideningCost(I, VF) == InstructionCost::getInvalid())
6828         return InstructionCost::getInvalid();
6829       if (Decision == CM_Scalarize)
6830         Width = ElementCount::getFixed(1);
6831     }
6832     VectorTy = toVectorTy(getLoadStoreType(I), Width);
6833     return getMemoryInstructionCost(I, VF);
6834   }
6835   case Instruction::BitCast:
6836     if (I->getType()->isPointerTy())
6837       return 0;
6838     [[fallthrough]];
6839   case Instruction::ZExt:
6840   case Instruction::SExt:
6841   case Instruction::FPToUI:
6842   case Instruction::FPToSI:
6843   case Instruction::FPExt:
6844   case Instruction::PtrToInt:
6845   case Instruction::IntToPtr:
6846   case Instruction::SIToFP:
6847   case Instruction::UIToFP:
6848   case Instruction::Trunc:
6849   case Instruction::FPTrunc: {
6850     // Computes the CastContextHint from a Load/Store instruction.
6851     auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
6852       assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
6853              "Expected a load or a store!");
6854 
6855       if (VF.isScalar() || !TheLoop->contains(I))
6856         return TTI::CastContextHint::Normal;
6857 
6858       switch (getWideningDecision(I, VF)) {
6859       case LoopVectorizationCostModel::CM_GatherScatter:
6860         return TTI::CastContextHint::GatherScatter;
6861       case LoopVectorizationCostModel::CM_Interleave:
6862         return TTI::CastContextHint::Interleave;
6863       case LoopVectorizationCostModel::CM_Scalarize:
6864       case LoopVectorizationCostModel::CM_Widen:
6865         return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
6866                                         : TTI::CastContextHint::Normal;
6867       case LoopVectorizationCostModel::CM_Widen_Reverse:
6868         return TTI::CastContextHint::Reversed;
6869       case LoopVectorizationCostModel::CM_Unknown:
6870         llvm_unreachable("Instr did not go through cost modelling?");
6871       case LoopVectorizationCostModel::CM_VectorCall:
6872       case LoopVectorizationCostModel::CM_IntrinsicCall:
6873         llvm_unreachable_internal("Instr has invalid widening decision");
6874       }
6875 
6876       llvm_unreachable("Unhandled case!");
6877     };
6878 
6879     unsigned Opcode = I->getOpcode();
6880     TTI::CastContextHint CCH = TTI::CastContextHint::None;
6881     // For Trunc, the context is the only user, which must be a StoreInst.
6882     if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
6883       if (I->hasOneUse())
6884         if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
6885           CCH = ComputeCCH(Store);
6886     }
6887     // For Z/Sext, the context is the operand, which must be a LoadInst.
6888     else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
6889              Opcode == Instruction::FPExt) {
6890       if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
6891         CCH = ComputeCCH(Load);
6892     }
6893 
6894     // We optimize the truncation of induction variables having constant
6895     // integer steps. The cost of these truncations is the same as the scalar
6896     // operation.
6897     if (isOptimizableIVTruncate(I, VF)) {
6898       auto *Trunc = cast<TruncInst>(I);
6899       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
6900                                   Trunc->getSrcTy(), CCH, CostKind, Trunc);
6901     }
6902 
6903     // Detect reduction patterns
6904     if (auto RedCost = getReductionPatternCost(I, VF, VectorTy))
6905       return *RedCost;
6906 
6907     Type *SrcScalarTy = I->getOperand(0)->getType();
6908     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6909     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6910       SrcScalarTy =
6911           IntegerType::get(SrcScalarTy->getContext(), MinBWs[Op0AsInstruction]);
6912     Type *SrcVecTy =
6913         VectorTy->isVectorTy() ? toVectorTy(SrcScalarTy, VF) : SrcScalarTy;
6914 
6915     if (canTruncateToMinimalBitwidth(I, VF)) {
6916       // If the result type is <= the source type, there will be no extend
6917       // after truncating the users to the minimal required bitwidth.
6918       if (VectorTy->getScalarSizeInBits() <= SrcVecTy->getScalarSizeInBits() &&
6919           (I->getOpcode() == Instruction::ZExt ||
6920            I->getOpcode() == Instruction::SExt))
6921         return 0;
6922     }
6923 
6924     return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
6925   }
6926   case Instruction::Call:
6927     return getVectorCallCost(cast<CallInst>(I), VF);
6928   case Instruction::ExtractValue:
6929     return TTI.getInstructionCost(I, CostKind);
6930   case Instruction::Alloca:
6931     // We cannot easily widen alloca to a scalable alloca, as
6932     // the result would need to be a vector of pointers.
6933     if (VF.isScalable())
6934       return InstructionCost::getInvalid();
6935     [[fallthrough]];
6936   default:
6937     // This opcode is unknown. Assume that it is the same as 'mul'.
6938     return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6939   } // end of switch.
6940 }
6941 
6942 void LoopVectorizationCostModel::collectValuesToIgnore() {
6943   // Ignore ephemeral values.
6944   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
6945 
6946   SmallVector<Value *, 4> DeadInterleavePointerOps;
6947   SmallVector<Value *, 4> DeadOps;
6948 
6949   // If a scalar epilogue is required, users outside the loop won't use
6950   // live-outs from the vector loop but from the scalar epilogue. Ignore them if
6951   // that is the case.
6952   bool RequiresScalarEpilogue = requiresScalarEpilogue(true);
6953   auto IsLiveOutDead = [this, RequiresScalarEpilogue](User *U) {
6954     return RequiresScalarEpilogue &&
6955            !TheLoop->contains(cast<Instruction>(U)->getParent());
6956   };
6957 
6958   LoopBlocksDFS DFS(TheLoop);
6959   DFS.perform(LI);
6960   MapVector<Value *, SmallVector<Value *>> DeadInvariantStoreOps;
6961   for (BasicBlock *BB : reverse(make_range(DFS.beginRPO(), DFS.endRPO())))
6962     for (Instruction &I : reverse(*BB)) {
6963       // Find all stores to invariant variables. Since they are going to sink
6964       // outside the loop, we do not need to calculate a cost for them.
6965       StoreInst *SI;
6966       if ((SI = dyn_cast<StoreInst>(&I)) &&
6967           Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) {
6968         ValuesToIgnore.insert(&I);
6969         DeadInvariantStoreOps[SI->getPointerOperand()].push_back(
6970             SI->getValueOperand());
6971       }
6972 
6973       if (VecValuesToIgnore.contains(&I) || ValuesToIgnore.contains(&I))
6974         continue;
6975 
6976       // Seed the worklist: add instructions to DeadOps that would be trivially
6977       // dead and are only used by already-ignored values.
6978       if (wouldInstructionBeTriviallyDead(&I, TLI) &&
6979           all_of(I.users(), [this, IsLiveOutDead](User *U) {
6980             return VecValuesToIgnore.contains(U) ||
6981                    ValuesToIgnore.contains(U) || IsLiveOutDead(U);
6982           }))
6983         DeadOps.push_back(&I);
6984 
6985       // For interleave groups, we only create a pointer for the start of the
6986       // interleave group. Queue up addresses of group members except the insert
6987       // position for further processing.
6988       if (isAccessInterleaved(&I)) {
6989         auto *Group = getInterleavedAccessGroup(&I);
6990         if (Group->getInsertPos() == &I)
6991           continue;
6992         Value *PointerOp = getLoadStorePointerOperand(&I);
6993         DeadInterleavePointerOps.push_back(PointerOp);
6994       }
6995 
6996       // Queue branches for analysis. They are dead if their successors only
6997       // contain dead instructions.
6998       if (auto *Br = dyn_cast<BranchInst>(&I)) {
6999         if (Br->isConditional())
7000           DeadOps.push_back(&I);
7001       }
7002     }
7003 
7004   // Mark ops feeding interleave group members as free if they are only used
7005   // by other dead computations.
7006   for (unsigned I = 0; I != DeadInterleavePointerOps.size(); ++I) {
7007     auto *Op = dyn_cast<Instruction>(DeadInterleavePointerOps[I]);
7008     if (!Op || !TheLoop->contains(Op) || any_of(Op->users(), [this](User *U) {
7009           Instruction *UI = cast<Instruction>(U);
7010           return !VecValuesToIgnore.contains(U) &&
7011                  (!isAccessInterleaved(UI) ||
7012                   getInterleavedAccessGroup(UI)->getInsertPos() == UI);
7013         }))
7014       continue;
7015     VecValuesToIgnore.insert(Op);
7016     DeadInterleavePointerOps.append(Op->op_begin(), Op->op_end());
7017   }
7018 
7019   for (const auto &[_, Ops] : DeadInvariantStoreOps) {
7020     for (Value *Op : ArrayRef(Ops).drop_back())
7021       DeadOps.push_back(Op);
7022   }
7023   // Mark ops that would be trivially dead and are only used by ignored
7024   // instructions as free.
7025   BasicBlock *Header = TheLoop->getHeader();
7026 
7027   // Returns true if the block contains only dead instructions. Such blocks will
7028   // be removed by VPlan-to-VPlan transforms and won't be considered by the
7029   // VPlan-based cost model, so skip them in the legacy cost-model as well.
7030   auto IsEmptyBlock = [this](BasicBlock *BB) {
7031     return all_of(*BB, [this](Instruction &I) {
7032       return ValuesToIgnore.contains(&I) || VecValuesToIgnore.contains(&I) ||
7033              (isa<BranchInst>(&I) && !cast<BranchInst>(&I)->isConditional());
7034     });
7035   };
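  // Process DeadOps as a worklist; operands of instructions that become ignored
  // are appended for further analysis.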
7036   for (unsigned I = 0; I != DeadOps.size(); ++I) {
7037     auto *Op = dyn_cast<Instruction>(DeadOps[I]);
7038 
7039     // Check if the branch should be considered dead.
7040     if (auto *Br = dyn_cast_or_null<BranchInst>(Op)) {
7041       BasicBlock *ThenBB = Br->getSuccessor(0);
7042       BasicBlock *ElseBB = Br->getSuccessor(1);
7043       // Don't consider branches leaving the loop for simplification.
7044       if (!TheLoop->contains(ThenBB) || !TheLoop->contains(ElseBB))
7045         continue;
7046       bool ThenEmpty = IsEmptyBlock(ThenBB);
7047       bool ElseEmpty = IsEmptyBlock(ElseBB);
7048       if ((ThenEmpty && ElseEmpty) ||
7049           (ThenEmpty && ThenBB->getSingleSuccessor() == ElseBB &&
7050            ElseBB->phis().empty()) ||
7051           (ElseEmpty && ElseBB->getSingleSuccessor() == ThenBB &&
7052            ThenBB->phis().empty())) {
7053         VecValuesToIgnore.insert(Br);
7054         DeadOps.push_back(Br->getCondition());
7055       }
7056       continue;
7057     }
7058 
7059     // Skip any op that shouldn't be considered dead.
7060     if (!Op || !TheLoop->contains(Op) ||
7061         (isa<PHINode>(Op) && Op->getParent() == Header) ||
7062         !wouldInstructionBeTriviallyDead(Op, TLI) ||
7063         any_of(Op->users(), [this, IsLiveOutDead](User *U) {
7064           return !VecValuesToIgnore.contains(U) &&
7065                  !ValuesToIgnore.contains(U) && !IsLiveOutDead(U);
7066         }))
7067       continue;
7068 
7069     if (!TheLoop->contains(Op->getParent()))
7070       continue;
7071 
7072     // If all of Op's users are in ValuesToIgnore, add it to ValuesToIgnore,
7073     // which applies to both scalar and vector versions. Otherwise it is only
7074     // dead in vector versions, so only add it to VecValuesToIgnore.
7075     if (all_of(Op->users(),
7076                [this](User *U) { return ValuesToIgnore.contains(U); }))
7077       ValuesToIgnore.insert(Op);
7078 
7079     VecValuesToIgnore.insert(Op);
7080     DeadOps.append(Op->op_begin(), Op->op_end());
7081   }
7082 
7083   // Ignore type-promoting instructions we identified during reduction
7084   // detection.
7085   for (const auto &Reduction : Legal->getReductionVars()) {
7086     const RecurrenceDescriptor &RedDes = Reduction.second;
7087     const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
7088     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7089   }
7090   // Ignore type-casting instructions we identified during induction
7091   // detection.
7092   for (const auto &Induction : Legal->getInductionVars()) {
7093     const InductionDescriptor &IndDes = Induction.second;
7094     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7095     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7096   }
7097 }
7098 
7099 void LoopVectorizationCostModel::collectInLoopReductions() {
7100   for (const auto &Reduction : Legal->getReductionVars()) {
7101     PHINode *Phi = Reduction.first;
7102     const RecurrenceDescriptor &RdxDesc = Reduction.second;
7103 
7104     // We don't collect reductions that are type promoted (yet).
7105     if (RdxDesc.getRecurrenceType() != Phi->getType())
7106       continue;
7107 
7108     // If the target would prefer this reduction to happen "in-loop", then we
7109     // want to record it as such.
7110     unsigned Opcode = RdxDesc.getOpcode();
7111     if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
7112         !TTI.preferInLoopReduction(Opcode, Phi->getType(),
7113                                    TargetTransformInfo::ReductionFlags()))
7114       continue;
7115 
7116     // Check that we can correctly put the reductions into the loop, by
7117     // finding the chain of operations that leads from the phi to the loop
7118     // exit value.
7119     SmallVector<Instruction *, 4> ReductionOperations =
7120         RdxDesc.getReductionOpChain(Phi, TheLoop);
7121     bool InLoop = !ReductionOperations.empty();
7122 
7123     if (InLoop) {
7124       InLoopReductions.insert(Phi);
7125       // Add the elements to InLoopReductionImmediateChains for cost modelling.
7126       Instruction *LastChain = Phi;
7127       for (auto *I : ReductionOperations) {
7128         InLoopReductionImmediateChains[I] = LastChain;
7129         LastChain = I;
7130       }
7131     }
7132     LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
7133                       << " reduction for phi: " << *Phi << "\n");
7134   }
7135 }
7136 
7137 // This function will select a scalable VF if the target supports scalable
7138 // vectors and a fixed one otherwise.
7139 // TODO: we could return a pair of values that specify the max VF and
7140 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
7141 // `buildVPlans(VF, VF)`. We cannot do it because VPlan at the moment
7142 // doesn't have a cost model that can choose which plan to execute if
7143 // more than one is generated.
7144 static ElementCount determineVPlanVF(const TargetTransformInfo &TTI,
7145                                      LoopVectorizationCostModel &CM) {
7146   unsigned WidestType;
7147   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
7148 
7149   TargetTransformInfo::RegisterKind RegKind =
7150       TTI.enableScalableVectorization()
7151           ? TargetTransformInfo::RGK_ScalableVector
7152           : TargetTransformInfo::RGK_FixedWidthVector;
7153 
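  // Fit as many elements of the widest type as possible into a single vector
  // register of the chosen kind.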
7154   TypeSize RegSize = TTI.getRegisterBitWidth(RegKind);
7155   unsigned N = RegSize.getKnownMinValue() / WidestType;
7156   return ElementCount::get(N, RegSize.isScalable());
7157 }
7158 
7159 VectorizationFactor
7160 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
7161   ElementCount VF = UserVF;
7162   // Outer loop handling: outer loops may require CFG and instruction-level
7163   // transformations before even evaluating whether vectorization is profitable.
7164   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7165   // the vectorization pipeline.
7166   if (!OrigLoop->isInnermost()) {
7167     // If the user doesn't provide a vectorization factor, determine a
7168     // reasonable one.
7169     if (UserVF.isZero()) {
7170       VF = determineVPlanVF(TTI, CM);
7171       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
7172 
7173       // Make sure we have a VF > 1 for stress testing.
7174       if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
7175         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
7176                           << "overriding computed VF.\n");
7177         VF = ElementCount::getFixed(4);
7178       }
7179     } else if (UserVF.isScalable() && !TTI.supportsScalableVectors() &&
7180                !ForceTargetSupportsScalableVectors) {
7181       LLVM_DEBUG(dbgs() << "LV: Not vectorizing. Scalable VF requested, but "
7182                         << "not supported by the target.\n");
7183       reportVectorizationFailure(
7184           "Scalable vectorization requested but not supported by the target",
7185           "the scalable user-specified vectorization width for outer-loop "
7186           "vectorization cannot be used because the target does not support "
7187           "scalable vectors.",
7188           "ScalableVFUnfeasible", ORE, OrigLoop);
7189       return VectorizationFactor::Disabled();
7190     }
7191     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7192     assert(isPowerOf2_32(VF.getKnownMinValue()) &&
7193            "VF needs to be a power of two");
7194     LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
7195                       << "VF " << VF << " to build VPlans.\n");
7196     buildVPlans(VF, VF);
7197 
7198     // For VPlan build stress testing, we bail out after VPlan construction.
7199     if (VPlanBuildStressTest)
7200       return VectorizationFactor::Disabled();
7201 
7202     return {VF, 0 /*Cost*/, 0 /* ScalarCost */};
7203   }
7204 
7205   LLVM_DEBUG(
7206       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
7207                 "VPlan-native path.\n");
7208   return VectorizationFactor::Disabled();
7209 }
7210 
7211 void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
7212   assert(OrigLoop->isInnermost() && "Inner loop expected.");
7213   CM.collectValuesToIgnore();
7214   CM.collectElementTypesForWidening();
7215 
7216   FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
7217   if (!MaxFactors) // Cases that should not be vectorized or interleaved.
7218     return;
7219 
7220   // Invalidate interleave groups if all blocks of loop will be predicated.
7221   if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
7222       !useMaskedInterleavedAccesses(TTI)) {
7223     LLVM_DEBUG(
7224         dbgs()
7225         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
7226            "which requires masked-interleaved support.\n");
7227     if (CM.InterleaveInfo.invalidateGroups())
7228       // Invalidating interleave groups also requires invalidating all decisions
7229       // based on them, which includes widening decisions and uniform and scalar
7230       // values.
7231       CM.invalidateCostModelingDecisions();
7232   }
7233 
7234   if (CM.foldTailByMasking())
7235     Legal->prepareToFoldTailByMasking();
7236 
7237   ElementCount MaxUserVF =
7238       UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
7239   if (UserVF) {
7240     if (!ElementCount::isKnownLE(UserVF, MaxUserVF)) {
7241       reportVectorizationInfo(
7242           "UserVF ignored because it may be larger than the maximal safe VF",
7243           "InvalidUserVF", ORE, OrigLoop);
7244     } else {
7245       assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
7246              "VF needs to be a power of two");
7247       // Collect the instructions (and their associated costs) that will be more
7248       // profitable to scalarize.
7249       CM.collectInLoopReductions();
7250       if (CM.selectUserVectorizationFactor(UserVF)) {
7251         LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
7252         buildVPlansWithVPRecipes(UserVF, UserVF);
7253         LLVM_DEBUG(printPlans(dbgs()));
7254         return;
7255       }
7256       reportVectorizationInfo("UserVF ignored because of invalid costs.",
7257                               "InvalidCost", ORE, OrigLoop);
7258     }
7259   }
7260 
7261   // Collect the Vectorization Factor Candidates.
7262   SmallVector<ElementCount> VFCandidates;
7263   for (auto VF = ElementCount::getFixed(1);
7264        ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
7265     VFCandidates.push_back(VF);
7266   for (auto VF = ElementCount::getScalable(1);
7267        ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
7268     VFCandidates.push_back(VF);
7269 
7270   CM.collectInLoopReductions();
7271   for (const auto &VF : VFCandidates) {
7272     // Collect Uniform and Scalar instructions after vectorization with VF.
7273     CM.collectUniformsAndScalars(VF);
7274 
7275     // Collect the instructions (and their associated costs) that will be more
7276     // profitable to scalarize.
7277     if (VF.isVector())
7278       CM.collectInstsToScalarize(VF);
7279   }
7280 
7281   buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
7282   buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
7283 
7284   LLVM_DEBUG(printPlans(dbgs()));
7285 }
7286 
7287 InstructionCost VPCostContext::getLegacyCost(Instruction *UI,
7288                                              ElementCount VF) const {
7289   if (ForceTargetInstructionCost.getNumOccurrences())
7290     return InstructionCost(ForceTargetInstructionCost.getNumOccurrences());
7291   return CM.getInstructionCost(UI, VF);
7292 }
7293 
7294 bool VPCostContext::skipCostComputation(Instruction *UI, bool IsVector) const {
7295   return CM.ValuesToIgnore.contains(UI) ||
7296          (IsVector && CM.VecValuesToIgnore.contains(UI)) ||
7297          SkipCostComputation.contains(UI);
7298 }
7299 
7300 InstructionCost
7301 LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
7302                                           VPCostContext &CostCtx) const {
7303   InstructionCost Cost;
7304   // Cost modeling for inductions is inaccurate in the legacy cost model
7305   // compared to the recipes that are generated. To match here initially during
7306   // VPlan cost model bring up directly use the induction costs from the legacy
7307   // cost model. Note that we do this as pre-processing; the VPlan may not have
7308   // any recipes associated with the original induction increment instruction
7309   // and may replace truncates with VPWidenIntOrFpInductionRecipe. We precompute
7310   // the cost of induction phis and increments (both that are represented by
7311   // recipes and those that are not), to avoid distinguishing between them here,
7312   // and skip all recipes that represent induction phis and increments (the
7313   // former case) later on, if they exist, to avoid counting them twice.
7314   // Similarly we pre-compute the cost of any optimized truncates.
7315   // TODO: Switch to more accurate costing based on VPlan.
7316   for (const auto &[IV, IndDesc] : Legal->getInductionVars()) {
7317     Instruction *IVInc = cast<Instruction>(
7318         IV->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
7319     SmallVector<Instruction *> IVInsts = {IVInc};
7320     for (unsigned I = 0; I != IVInsts.size(); I++) {
7321       for (Value *Op : IVInsts[I]->operands()) {
7322         auto *OpI = dyn_cast<Instruction>(Op);
7323         if (Op == IV || !OpI || !OrigLoop->contains(OpI) || !Op->hasOneUse())
7324           continue;
7325         IVInsts.push_back(OpI);
7326       }
7327     }
7328     IVInsts.push_back(IV);
7329     for (User *U : IV->users()) {
7330       auto *CI = cast<Instruction>(U);
7331       if (!CostCtx.CM.isOptimizableIVTruncate(CI, VF))
7332         continue;
7333       IVInsts.push_back(CI);
7334     }
7335 
7336     // If the vector loop gets executed exactly once with the given VF, ignore
7337     // the costs of comparison and induction instructions, as they'll get
7338     // simplified away.
7339     // TODO: Remove this code after stepping away from the legacy cost model and
7340     // adding code to simplify VPlans before calculating their costs.
7341     auto TC = PSE.getSE()->getSmallConstantTripCount(OrigLoop);
7342     if (VF.isFixed() && TC == VF.getFixedValue() && !CM.foldTailByMasking())
7343       addFullyUnrolledInstructionsToIgnore(OrigLoop, Legal->getInductionVars(),
7344                                            CostCtx.SkipCostComputation);
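    // For example (hypothetical values): with a constant trip count of 8 and
    // VF = 8 (no tail folding), the vector loop body runs exactly once, so the
    // IV increment and the latch compare fold away and should not be costed.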
7345 
7346     for (Instruction *IVInst : IVInsts) {
7347       if (CostCtx.skipCostComputation(IVInst, VF.isVector()))
7348         continue;
7349       InstructionCost InductionCost = CostCtx.getLegacyCost(IVInst, VF);
7350       LLVM_DEBUG({
7351         dbgs() << "Cost of " << InductionCost << " for VF " << VF
7352                << ": induction instruction " << *IVInst << "\n";
7353       });
7354       Cost += InductionCost;
7355       CostCtx.SkipCostComputation.insert(IVInst);
7356     }
7357   }
7358 
7359   // Compute the cost of all exiting conditions of the loop using the legacy
7360   // cost model. This is to match the legacy behavior, which adds the cost of
7361   // all exit conditions. Note that this over-estimates the cost, as there will
7362   // be a single condition to control the vector loop.
7363   SmallVector<BasicBlock *> Exiting;
7364   CM.TheLoop->getExitingBlocks(Exiting);
7365   SetVector<Instruction *> ExitInstrs;
7366   // Collect all exit conditions.
7367   for (BasicBlock *EB : Exiting) {
7368     auto *Term = dyn_cast<BranchInst>(EB->getTerminator());
7369     if (!Term)
7370       continue;
7371     if (auto *CondI = dyn_cast<Instruction>(Term->getOperand(0))) {
7372       ExitInstrs.insert(CondI);
7373     }
7374   }
7375   // Compute the cost of all instructions only feeding the exit conditions.
7376   for (unsigned I = 0; I != ExitInstrs.size(); ++I) {
7377     Instruction *CondI = ExitInstrs[I];
7378     if (!OrigLoop->contains(CondI) ||
7379         !CostCtx.SkipCostComputation.insert(CondI).second)
7380       continue;
7381     InstructionCost CondICost = CostCtx.getLegacyCost(CondI, VF);
7382     LLVM_DEBUG({
7383       dbgs() << "Cost of " << CondICost << " for VF " << VF
7384              << ": exit condition instruction " << *CondI << "\n";
7385     });
7386     Cost += CondICost;
7387     for (Value *Op : CondI->operands()) {
7388       auto *OpI = dyn_cast<Instruction>(Op);
7389       if (!OpI || any_of(OpI->users(), [&ExitInstrs, this](User *U) {
7390             return OrigLoop->contains(cast<Instruction>(U)->getParent()) &&
7391                    !ExitInstrs.contains(cast<Instruction>(U));
7392           }))
7393         continue;
7394       ExitInstrs.insert(OpI);
7395     }
7396   }
7397 
7398   // The legacy cost model has special logic to compute the cost of in-loop
7399   // reductions, which may be smaller than the sum of all instructions involved
7400   // in the reduction.
7401   // TODO: Switch to costing based on VPlan once the logic has been ported.
7402   for (const auto &[RedPhi, RdxDesc] : Legal->getReductionVars()) {
7403     if (ForceTargetInstructionCost.getNumOccurrences())
7404       continue;
7405 
7406     if (!CM.isInLoopReduction(RedPhi))
7407       continue;
7408 
7409     const auto &ChainOps = RdxDesc.getReductionOpChain(RedPhi, OrigLoop);
7410     SetVector<Instruction *> ChainOpsAndOperands(ChainOps.begin(),
7411                                                  ChainOps.end());
7412     auto IsZExtOrSExt = [](const unsigned Opcode) -> bool {
7413       return Opcode == Instruction::ZExt || Opcode == Instruction::SExt;
7414     };
7415     // Also include the operands of instructions in the chain, as the cost-model
7416     // may mark extends as free.
7417     //
7418     // For ARM, some of the instructions can be folded into the reduction
7419     // instruction. So we need to mark all folded instructions free.
7420     // For example: We can fold reduce(mul(ext(A), ext(B))) into one
7421     // instruction.
7422     for (auto *ChainOp : ChainOps) {
7423       for (Value *Op : ChainOp->operands()) {
7424         if (auto *I = dyn_cast<Instruction>(Op)) {
7425           ChainOpsAndOperands.insert(I);
7426           if (I->getOpcode() == Instruction::Mul) {
7427             auto *Ext0 = dyn_cast<Instruction>(I->getOperand(0));
7428             auto *Ext1 = dyn_cast<Instruction>(I->getOperand(1));
7429             if (Ext0 && IsZExtOrSExt(Ext0->getOpcode()) && Ext1 &&
7430                 Ext0->getOpcode() == Ext1->getOpcode()) {
7431               ChainOpsAndOperands.insert(Ext0);
7432               ChainOpsAndOperands.insert(Ext1);
7433             }
7434           }
7435         }
7436       }
7437     }
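    // Illustrative IR for the reduce(mul(ext(A), ext(B))) case above (not
    // emitted here; names and types are hypothetical):
    //   %a.ext = sext <16 x i8> %a to <16 x i32>
    //   %b.ext = sext <16 x i8> %b to <16 x i32>
    //   %mul   = mul <16 x i32> %a.ext, %b.ext
    //   %rdx   = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %mul)
    // On targets with dot-product style instructions the extends and the
    // multiply can be folded into the reduction, so they are costed as free.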
7438 
7439     // Pre-compute the cost for I, if it has a reduction pattern cost.
7440     for (Instruction *I : ChainOpsAndOperands) {
7441       auto ReductionCost =
7442           CM.getReductionPatternCost(I, VF, toVectorTy(I->getType(), VF));
7443       if (!ReductionCost)
7444         continue;
7445 
7446       assert(!CostCtx.SkipCostComputation.contains(I) &&
7447              "reduction op visited multiple times");
7448       CostCtx.SkipCostComputation.insert(I);
7449       LLVM_DEBUG(dbgs() << "Cost of " << ReductionCost << " for VF " << VF
7450                         << ":\n in-loop reduction " << *I << "\n");
7451       Cost += *ReductionCost;
7452     }
7453   }
7454 
7455   // Pre-compute the costs for branches except for the backedge, as the number
7456   // of replicate regions in a VPlan may not directly match the number of
7457   // branches, which would lead to different decisions.
7458   // TODO: Compute cost of branches for each replicate region in the VPlan,
7459   // which is more accurate than the legacy cost model.
7460   for (BasicBlock *BB : OrigLoop->blocks()) {
7461     if (CostCtx.skipCostComputation(BB->getTerminator(), VF.isVector()))
7462       continue;
7463     CostCtx.SkipCostComputation.insert(BB->getTerminator());
7464     if (BB == OrigLoop->getLoopLatch())
7465       continue;
7466     auto BranchCost = CostCtx.getLegacyCost(BB->getTerminator(), VF);
7467     Cost += BranchCost;
7468   }
7469 
7470   // Pre-compute costs for instructions that are forced-scalar or profitable to
7471   // scalarize. Their costs will be computed separately in the legacy cost
7472   // model.
7473   for (Instruction *ForcedScalar : CM.ForcedScalars[VF]) {
7474     if (CostCtx.skipCostComputation(ForcedScalar, VF.isVector()))
7475       continue;
7476     CostCtx.SkipCostComputation.insert(ForcedScalar);
7477     InstructionCost ForcedCost = CostCtx.getLegacyCost(ForcedScalar, VF);
7478     LLVM_DEBUG({
7479       dbgs() << "Cost of " << ForcedCost << " for VF " << VF
7480              << ": forced scalar " << *ForcedScalar << "\n";
7481     });
7482     Cost += ForcedCost;
7483   }
7484   for (const auto &[Scalarized, ScalarCost] : CM.InstsToScalarize[VF]) {
7485     if (CostCtx.skipCostComputation(Scalarized, VF.isVector()))
7486       continue;
7487     CostCtx.SkipCostComputation.insert(Scalarized);
7488     LLVM_DEBUG({
7489       dbgs() << "Cost of " << ScalarCost << " for VF " << VF
7490              << ": profitable to scalarize " << *Scalarized << "\n";
7491     });
7492     Cost += ScalarCost;
7493   }
7494 
7495   return Cost;
7496 }
7497 
7498 InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
7499                                                ElementCount VF) const {
7500   VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM,
7501                         CM.CostKind);
7502   InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx);
7503 
7504   // Now compute and add the VPlan-based cost.
7505   Cost += Plan.cost(VF, CostCtx);
7506 #ifndef NDEBUG
7507   unsigned EstimatedWidth = getEstimatedRuntimeVF(OrigLoop, CM.TTI, VF);
7508   LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost
7509                     << " (Estimated cost per lane: ");
7510   if (Cost.isValid()) {
7511     double CostPerLane = double(*Cost.getValue()) / EstimatedWidth;
7512     LLVM_DEBUG(dbgs() << format("%.1f", CostPerLane));
7513   } else /* No point dividing an invalid cost - it will still be invalid */
7514     LLVM_DEBUG(dbgs() << "Invalid");
7515   LLVM_DEBUG(dbgs() << ")\n");
7516 #endif
7517   return Cost;
7518 }
7519 
7520 #ifndef NDEBUG
7521 /// Return true if the original loop \p TheLoop contains any instructions that do
7522 /// not have corresponding recipes in \p Plan and are not marked to be ignored
7523 /// in \p CostCtx. This means the VPlan contains simplification that the legacy
7524 /// cost-model did not account for.
7525 static bool planContainsAdditionalSimplifications(VPlan &Plan,
7526                                                   VPCostContext &CostCtx,
7527                                                   Loop *TheLoop) {
7528   // First collect all instructions for the recipes in Plan.
7529   auto GetInstructionForCost = [](const VPRecipeBase *R) -> Instruction * {
7530     if (auto *S = dyn_cast<VPSingleDefRecipe>(R))
7531       return dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
7532     if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(R))
7533       return &WidenMem->getIngredient();
7534     return nullptr;
7535   };
7536 
7537   DenseSet<Instruction *> SeenInstrs;
7538   auto Iter = vp_depth_first_deep(Plan.getVectorLoopRegion()->getEntry());
7539   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
7540     for (VPRecipeBase &R : *VPBB) {
7541       if (auto *IR = dyn_cast<VPInterleaveRecipe>(&R)) {
7542         auto *IG = IR->getInterleaveGroup();
7543         unsigned NumMembers = IG->getNumMembers();
7544         for (unsigned I = 0; I != NumMembers; ++I) {
7545           if (Instruction *M = IG->getMember(I))
7546             SeenInstrs.insert(M);
7547         }
7548         continue;
7549       }
7550       // The VPlan-based cost model is more accurate for partial reductions, and
7551       // comparing against the legacy cost isn't desirable.
7552       if (isa<VPPartialReductionRecipe>(&R))
7553         return true;
7554       if (Instruction *UI = GetInstructionForCost(&R))
7555         SeenInstrs.insert(UI);
7556     }
7557   }
7558 
7559   // Return true if the loop contains any instructions that are not also part of
7560   // the VPlan or are skipped for VPlan-based cost computations. This indicates
7561   // that the VPlan contains extra simplifications.
7562   return any_of(TheLoop->blocks(), [&SeenInstrs, &CostCtx,
7563                                     TheLoop](BasicBlock *BB) {
7564     return any_of(*BB, [&SeenInstrs, &CostCtx, TheLoop, BB](Instruction &I) {
7565       if (isa<PHINode>(&I) && BB == TheLoop->getHeader())
7566         return false;
7567       return !SeenInstrs.contains(&I) && !CostCtx.skipCostComputation(&I, true);
7568     });
7569   });
7570 }
7571 #endif
7572 
7573 VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
7574   if (VPlans.empty())
7575     return VectorizationFactor::Disabled();
7576   // If there is a single VPlan with a single VF, return it directly.
7577   VPlan &FirstPlan = *VPlans[0];
7578   if (VPlans.size() == 1 && size(FirstPlan.vectorFactors()) == 1)
7579     return {*FirstPlan.vectorFactors().begin(), 0, 0};
7580 
7581   LLVM_DEBUG(dbgs() << "LV: Computing best VF using cost kind: "
7582                     << (CM.CostKind == TTI::TCK_RecipThroughput
7583                             ? "Reciprocal Throughput\n"
7584                         : CM.CostKind == TTI::TCK_Latency
7585                             ? "Instruction Latency\n"
7586                         : CM.CostKind == TTI::TCK_CodeSize ? "Code Size\n"
7587                         : CM.CostKind == TTI::TCK_SizeAndLatency
7588                             ? "Code Size and Latency\n"
7589                             : "Unknown\n"));
7590 
7591   ElementCount ScalarVF = ElementCount::getFixed(1);
7592   assert(hasPlanWithVF(ScalarVF) &&
7593          "More than a single plan/VF w/o any plan having scalar VF");
7594 
7595   // TODO: Compute scalar cost using VPlan-based cost model.
7596   InstructionCost ScalarCost = CM.expectedCost(ScalarVF);
7597   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ScalarCost << ".\n");
7598   VectorizationFactor ScalarFactor(ScalarVF, ScalarCost, ScalarCost);
7599   VectorizationFactor BestFactor = ScalarFactor;
7600 
7601   bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
7602   if (ForceVectorization) {
7603     // Ignore scalar width, because the user explicitly wants vectorization.
7604     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
7605     // evaluation.
7606     BestFactor.Cost = InstructionCost::getMax();
7607   }
7608 
7609   for (auto &P : VPlans) {
7610     for (ElementCount VF : P->vectorFactors()) {
7611       if (VF.isScalar())
7612         continue;
7613       if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
7614         LLVM_DEBUG(
7615             dbgs()
7616             << "LV: Not considering vector loop of width " << VF
7617             << " because it will not generate any vector instructions.\n");
7618         continue;
7619       }
7620 
7621       InstructionCost Cost = cost(*P, VF);
7622       VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);
7623       if (isMoreProfitable(CurrentFactor, BestFactor))
7624         BestFactor = CurrentFactor;
7625 
7626       // If profitable, add it to the ProfitableVFs list.
7627       if (isMoreProfitable(CurrentFactor, ScalarFactor))
7628         ProfitableVFs.push_back(CurrentFactor);
7629     }
7630   }
7631 
7632 #ifndef NDEBUG
7633   // Select the optimal vectorization factor according to the legacy cost-model.
7634   // This is now only used to verify the decisions by the new VPlan-based
7635   // cost-model and will be retired once the VPlan-based cost-model is
7636   // stabilized.
7637   VectorizationFactor LegacyVF = selectVectorizationFactor();
7638   VPlan &BestPlan = getPlanFor(BestFactor.Width);
7639 
7640   // Pre-compute the cost and use it to check if BestPlan contains any
7641   // simplifications not accounted for in the legacy cost model. If that's the
7642   // case, don't trigger the assertion, as the extra simplifications may cause a
7643   // different VF to be picked by the VPlan-based cost model.
7644   VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM,
7645                         CM.CostKind);
7646   precomputeCosts(BestPlan, BestFactor.Width, CostCtx);
7647   assert((BestFactor.Width == LegacyVF.Width ||
7648           planContainsAdditionalSimplifications(getPlanFor(BestFactor.Width),
7649                                                 CostCtx, OrigLoop) ||
7650           planContainsAdditionalSimplifications(getPlanFor(LegacyVF.Width),
7651                                                 CostCtx, OrigLoop)) &&
7652          " VPlan cost model and legacy cost model disagreed");
7653   assert((BestFactor.Width.isScalar() || BestFactor.ScalarCost > 0) &&
7654          "when vectorizing, the scalar cost must be computed.");
7655 #endif
7656 
7657   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << BestFactor.Width << ".\n");
7658   return BestFactor;
7659 }
7660 
7661 static void addRuntimeUnrollDisableMetaData(Loop *L) {
7662   SmallVector<Metadata *, 4> MDs;
7663   // Reserve first location for self reference to the LoopID metadata node.
7664   MDs.push_back(nullptr);
7665   bool IsUnrollMetadata = false;
7666   MDNode *LoopID = L->getLoopID();
7667   if (LoopID) {
7668     // First find existing loop unrolling disable metadata.
7669     for (unsigned I = 1, IE = LoopID->getNumOperands(); I < IE; ++I) {
7670       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(I));
7671       if (MD) {
7672         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
7673         IsUnrollMetadata =
7674             S && S->getString().starts_with("llvm.loop.unroll.disable");
7675       }
7676       MDs.push_back(LoopID->getOperand(I));
7677     }
7678   }
7679 
7680   if (!IsUnrollMetadata) {
7681     // Add runtime unroll disable metadata.
7682     LLVMContext &Context = L->getHeader()->getContext();
7683     SmallVector<Metadata *, 1> DisableOperands;
7684     DisableOperands.push_back(
7685         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7686     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7687     MDs.push_back(DisableNode);
7688     MDNode *NewLoopID = MDNode::get(Context, MDs);
7689     // Set operand 0 to refer to the loop id itself.
7690     NewLoopID->replaceOperandWith(0, NewLoopID);
7691     L->setLoopID(NewLoopID);
7692   }
7693 }
7694 
7695 // If \p R is a ComputeReductionResult when vectorizing the epilog loop,
7696 // fix the reduction's scalar PHI node by adding the incoming value from the
7697 // main vector loop.
7698 static void fixReductionScalarResumeWhenVectorizingEpilog(
7699     VPRecipeBase *R, VPTransformState &State, BasicBlock *LoopMiddleBlock,
7700     BasicBlock *BypassBlock) {
7701   auto *EpiRedResult = dyn_cast<VPInstruction>(R);
7702   if (!EpiRedResult ||
7703       EpiRedResult->getOpcode() != VPInstruction::ComputeReductionResult)
7704     return;
7705 
7706   auto *EpiRedHeaderPhi =
7707       cast<VPReductionPHIRecipe>(EpiRedResult->getOperand(0));
7708   const RecurrenceDescriptor &RdxDesc =
7709       EpiRedHeaderPhi->getRecurrenceDescriptor();
7710   Value *MainResumeValue =
7711       EpiRedHeaderPhi->getStartValue()->getUnderlyingValue();
7712   if (RecurrenceDescriptor::isAnyOfRecurrenceKind(
7713           RdxDesc.getRecurrenceKind())) {
7714     auto *Cmp = cast<ICmpInst>(MainResumeValue);
7715     assert(Cmp->getPredicate() == CmpInst::ICMP_NE &&
7716            "AnyOf expected to start with ICMP_NE");
7717     assert(Cmp->getOperand(1) == RdxDesc.getRecurrenceStartValue() &&
7718            "AnyOf expected to start by comparing main resume value to original "
7719            "start value");
7720     MainResumeValue = Cmp->getOperand(0);
7721   } else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(
7722                  RdxDesc.getRecurrenceKind())) {
7723     using namespace llvm::PatternMatch;
7724     Value *Cmp, *OrigResumeV;
7725     bool IsExpectedPattern =
7726         match(MainResumeValue, m_Select(m_OneUse(m_Value(Cmp)),
7727                                         m_Specific(RdxDesc.getSentinelValue()),
7728                                         m_Value(OrigResumeV))) &&
7729         match(Cmp,
7730               m_SpecificICmp(ICmpInst::ICMP_EQ, m_Specific(OrigResumeV),
7731                              m_Specific(RdxDesc.getRecurrenceStartValue())));
7732     assert(IsExpectedPattern && "Unexpected reduction resume pattern");
7733     (void)IsExpectedPattern;
7734     MainResumeValue = OrigResumeV;
7735   }
7736   PHINode *MainResumePhi = cast<PHINode>(MainResumeValue);
7737 
7738   // When fixing reductions in the epilogue loop we should already have
7739   // created a bc.merge.rdx Phi after the main vector body. Ensure that we carry
7740   // over the incoming values correctly.
7741   using namespace VPlanPatternMatch;
7742   auto IsResumePhi = [](VPUser *U) {
7743     return match(
7744         U, m_VPInstruction<VPInstruction::ResumePhi>(m_VPValue(), m_VPValue()));
7745   };
7746   assert(count_if(EpiRedResult->users(), IsResumePhi) == 1 &&
7747          "ResumePhi must have a single user");
7748   auto *EpiResumePhiVPI =
7749       cast<VPInstruction>(*find_if(EpiRedResult->users(), IsResumePhi));
7750   auto *EpiResumePhi = cast<PHINode>(State.get(EpiResumePhiVPI, true));
7751   EpiResumePhi->setIncomingValueForBlock(
7752       BypassBlock, MainResumePhi->getIncomingValueForBlock(BypassBlock));
7753 }
7754 
7755 DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
7756     ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan,
7757     InnerLoopVectorizer &ILV, DominatorTree *DT, bool VectorizingEpilogue,
7758     const DenseMap<const SCEV *, Value *> *ExpandedSCEVs) {
7759   assert(BestVPlan.hasVF(BestVF) &&
7760          "Trying to execute plan with unsupported VF");
7761   assert(BestVPlan.hasUF(BestUF) &&
7762          "Trying to execute plan with unsupported UF");
7763   assert(
7764       ((VectorizingEpilogue && ExpandedSCEVs) ||
7765        (!VectorizingEpilogue && !ExpandedSCEVs)) &&
7766       "expanded SCEVs to reuse can only be used during epilogue vectorization");
7767 
7768   // TODO: Move to VPlan transform stage once the transition to the VPlan-based
7769   // cost model is complete for better cost estimates.
7770   VPlanTransforms::unrollByUF(BestVPlan, BestUF,
7771                               OrigLoop->getHeader()->getContext());
7772   VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
7773   VPlanTransforms::convertToConcreteRecipes(BestVPlan);
7774 
7775   // Perform the actual loop transformation.
7776   VPTransformState State(&TTI, BestVF, BestUF, LI, DT, ILV.Builder, &ILV,
7777                          &BestVPlan, OrigLoop->getParentLoop(),
7778                          Legal->getWidestInductionType());
7779 
7780 #ifdef EXPENSIVE_CHECKS
7781   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
7782 #endif
7783 
7784   // 0. Generate SCEV-dependent code in the entry, including TripCount, before
7785   // making any changes to the CFG.
7786   if (!BestVPlan.getEntry()->empty())
7787     BestVPlan.getEntry()->execute(&State);
7788 
7789   if (!ILV.getTripCount())
7790     ILV.setTripCount(State.get(BestVPlan.getTripCount(), VPLane(0)));
7791   else
7792     assert(VectorizingEpilogue && "should only re-use the existing trip "
7793                                   "count during epilogue vectorization");
7794 
7795   // 1. Set up the skeleton for vectorization, including vector pre-header and
7796   // middle block. The vector loop is created during VPlan execution.
7797   VPBasicBlock *VectorPH =
7798       cast<VPBasicBlock>(BestVPlan.getEntry()->getSingleSuccessor());
7799   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(
7800       ExpandedSCEVs ? *ExpandedSCEVs : State.ExpandedSCEVs);
7801   if (VectorizingEpilogue)
7802     VPlanTransforms::removeDeadRecipes(BestVPlan);
7803 
7804   // Only use noalias metadata when using memory checks guaranteeing no overlap
7805   // across all iterations.
7806   const LoopAccessInfo *LAI = ILV.Legal->getLAI();
7807   std::unique_ptr<LoopVersioning> LVer = nullptr;
7808   if (LAI && !LAI->getRuntimePointerChecking()->getChecks().empty() &&
7809       !LAI->getRuntimePointerChecking()->getDiffChecks()) {
7810 
7811     //  We currently don't use LoopVersioning for the actual loop cloning but we
7812     //  still use it to add the noalias metadata.
7813     //  TODO: Find a better way to re-use LoopVersioning functionality to add
7814     //        metadata.
7815     LVer = std::make_unique<LoopVersioning>(
7816         *LAI, LAI->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, DT,
7817         PSE.getSE());
7818     State.LVer = &*LVer;
7819     State.LVer->prepareNoAliasMetadata();
7820   }
7821 
7822   ILV.printDebugTracesAtStart();
7823 
7824   //===------------------------------------------------===//
7825   //
7826   // Notice: any optimization or new instruction that goes
7827   // into the code below should also be implemented in
7828   // the cost-model.
7829   //
7830   //===------------------------------------------------===//
7831 
7832   // 2. Copy and widen instructions from the old loop into the new loop.
7833   BestVPlan.prepareToExecute(
7834       ILV.getTripCount(),
7835       ILV.getOrCreateVectorTripCount(ILV.LoopVectorPreHeader), State);
7836   replaceVPBBWithIRVPBB(VectorPH, State.CFG.PrevBB);
7837 
7838   BestVPlan.execute(&State);
7839 
7840   auto *MiddleVPBB = BestVPlan.getMiddleBlock();
7841   // 2.5 When vectorizing the epilogue, fix reduction and induction resume
7842   // values from the additional bypass block.
7843   if (VectorizingEpilogue) {
7844     assert(!ILV.Legal->hasUncountableEarlyExit() &&
7845            "Epilogue vectorisation not yet supported with early exits");
7846     BasicBlock *BypassBlock = ILV.getAdditionalBypassBlock();
7847     for (VPRecipeBase &R : *MiddleVPBB) {
7848       fixReductionScalarResumeWhenVectorizingEpilog(
7849           &R, State, State.CFG.VPBB2IRBB[MiddleVPBB], BypassBlock);
7850     }
7851     BasicBlock *PH = OrigLoop->getLoopPreheader();
7852     for (const auto &[IVPhi, _] : Legal->getInductionVars()) {
7853       auto *Inc = cast<PHINode>(IVPhi->getIncomingValueForBlock(PH));
7854       Value *V = ILV.getInductionAdditionalBypassValue(IVPhi);
7855       Inc->setIncomingValueForBlock(BypassBlock, V);
7856     }
7857   }
7858 
7859   // 2.6. Maintain Loop Hints
7860   // Keep all loop hints from the original loop on the vector loop (we'll
7861   // replace the vectorizer-specific hints below).
7862   if (auto *LoopRegion = BestVPlan.getVectorLoopRegion()) {
7863     MDNode *OrigLoopID = OrigLoop->getLoopID();
7864 
7865     std::optional<MDNode *> VectorizedLoopID =
7866         makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7867                                         LLVMLoopVectorizeFollowupVectorized});
7868 
7869     VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
7870     Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]);
7871     if (VectorizedLoopID) {
7872       L->setLoopID(*VectorizedLoopID);
7873     } else {
7874       // Keep all loop hints from the original loop on the vector loop (we'll
7875       // replace the vectorizer-specific hints below).
7876       if (MDNode *LID = OrigLoop->getLoopID())
7877         L->setLoopID(LID);
7878 
7879       LoopVectorizeHints Hints(L, true, *ORE);
7880       Hints.setAlreadyVectorized();
7881     }
7882     TargetTransformInfo::UnrollingPreferences UP;
7883     TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE);
7884     if (!UP.UnrollVectorizedLoop || VectorizingEpilogue)
7885       addRuntimeUnrollDisableMetaData(L);
7886   }
7887 
7888   // 3. Fix the vectorized code: take care of header phi's, live-outs,
7889   //    predication, updating analyses.
7890   ILV.fixVectorizedLoop(State);
7891 
7892   ILV.printDebugTracesAtEnd();
7893 
7894   // 4. Adjust branch weight of the branch in the middle block.
7895   if (BestVPlan.getVectorLoopRegion()) {
7896     auto *MiddleVPBB = BestVPlan.getMiddleBlock();
7897     auto *MiddleTerm =
7898         cast<BranchInst>(State.CFG.VPBB2IRBB[MiddleVPBB]->getTerminator());
7899     if (MiddleTerm->isConditional() &&
7900         hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
7901       // Assume that `Count % VectorTripCount` is equally distributed.
7902       unsigned TripCount = BestVPlan.getUF() * State.VF.getKnownMinValue();
7903       assert(TripCount > 0 && "trip count should not be zero");
7904       const uint32_t Weights[] = {1, TripCount - 1};
7905       setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false);
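      // For example (hypothetical factors): with VF = 4 and UF = 2 the step is
      // 8 and the weights are {1, 7}, modelling that `Count % 8` is zero (no
      // scalar iterations remain) in roughly 1 out of 8 cases.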
7906     }
7907   }
7908 
7909   return State.ExpandedSCEVs;
7910 }
7911 
7912 //===--------------------------------------------------------------------===//
7913 // EpilogueVectorizerMainLoop
7914 //===--------------------------------------------------------------------===//
7915 
7916 /// This function is partially responsible for generating the control flow
7917 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7918 BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton(
7919     const SCEV2ValueTy &ExpandedSCEVs) {
7920   createVectorLoopSkeleton("");
7921 
7922   // Generate the code to check the minimum iteration count of the vector
7923   // epilogue (see below).
7924   EPI.EpilogueIterationCountCheck =
7925       emitIterationCountCheck(LoopScalarPreHeader, true);
7926   EPI.EpilogueIterationCountCheck->setName("iter.check");
7927 
7928   // Generate the code to check any assumptions that we've made for SCEV
7929   // expressions.
7930   EPI.SCEVSafetyCheck = emitSCEVChecks(LoopScalarPreHeader);
7931 
7932   // Generate the code that checks at runtime if arrays overlap. We put the
7933   // checks into a separate block to make the more common case of few elements
7934   // faster.
7935   EPI.MemSafetyCheck = emitMemRuntimeChecks(LoopScalarPreHeader);
7936 
7937   // Generate the iteration count check for the main loop, *after* the check
7938   // for the epilogue loop, so that the path-length is shorter for the case
7939   // that goes directly through the vector epilogue. The longer path length for
7940   // the main loop is compensated for by the gain from vectorizing the larger
7941   // trip count. Note: the branch will get updated later on when we vectorize
7942   // the epilogue.
7943   EPI.MainLoopIterationCountCheck =
7944       emitIterationCountCheck(LoopScalarPreHeader, false);
7945 
7946   // Generate the induction variable.
7947   EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
7948 
7949   return LoopVectorPreHeader;
7950 }
7951 
7952 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
7953   LLVM_DEBUG({
7954     dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7955            << "Main Loop VF:" << EPI.MainLoopVF
7956            << ", Main Loop UF:" << EPI.MainLoopUF
7957            << ", Epilogue Loop VF:" << EPI.EpilogueVF
7958            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7959   });
7960 }
7961 
7962 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
7963   DEBUG_WITH_TYPE(VerboseDebug, {
7964     dbgs() << "intermediate fn:\n"
7965            << *OrigLoop->getHeader()->getParent() << "\n";
7966   });
7967 }
7968 
7969 BasicBlock *
7970 EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
7971                                                     bool ForEpilogue) {
7972   assert(Bypass && "Expected valid bypass basic block.");
7973   ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF;
7974   unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
7975   Value *Count = getTripCount();
7976   // Reuse existing vector loop preheader for TC checks.
7977   // Note that new preheader block is generated for vector loop.
7978   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
7979   IRBuilder<> Builder(TCCheckBlock->getTerminator());
7980 
7981   // Generate code to check if the loop's trip count is less than VF * UF of the
7982   // main vector loop.
7983   auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF.isVector()
7984                                                     : VF.isVector())
7985                ? ICmpInst::ICMP_ULE
7986                : ICmpInst::ICMP_ULT;
7987 
7988   Value *CheckMinIters = Builder.CreateICmp(
7989       P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor),
7990       "min.iters.check");
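  // For example (hypothetical factors): with VFactor = 8 and UFactor = 2 the
  // check compares the trip count against 16 using `ult`, or `ule` when a
  // scalar epilogue is required, so that at least one scalar iteration is
  // guaranteed to remain.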
7991 
7992   if (!ForEpilogue)
7993     TCCheckBlock->setName("vector.main.loop.iter.check");
7994 
7995   // Create new preheader for vector loop.
7996   LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
7997                                    DT, LI, nullptr, "vector.ph");
7998 
7999   if (ForEpilogue) {
8000     assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
8001                                  DT->getNode(Bypass)->getIDom()) &&
8002            "TC check is expected to dominate Bypass");
8003 
8004     LoopBypassBlocks.push_back(TCCheckBlock);
8005 
8006     // Save the trip count so we don't have to regenerate it in the
8007     // vec.epilog.iter.check. This is safe to do because the trip count
8008     // generated here dominates the vector epilog iter check.
8009     EPI.TripCount = Count;
8010   }
8011 
8012   BranchInst &BI =
8013       *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
8014   if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
8015     setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
8016   ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
8017 
8018   introduceCheckBlockInVPlan(TCCheckBlock);
8019   return TCCheckBlock;
8020 }
8021 
8022 //===--------------------------------------------------------------------===//
8023 // EpilogueVectorizerEpilogueLoop
8024 //===--------------------------------------------------------------------===//
8025 
8026 /// This function is partially responsible for generating the control flow
8027 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
8028 BasicBlock *
8029 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
8030     const SCEV2ValueTy &ExpandedSCEVs) {
8031   createVectorLoopSkeleton("vec.epilog.");
8032 
8033   // Now, compare the remaining count and if there aren't enough iterations to
8034   // execute the vectorized epilogue skip to the scalar part.
8035   LoopVectorPreHeader->setName("vec.epilog.ph");
8036   BasicBlock *VecEpilogueIterationCountCheck =
8037       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->begin(), DT, LI,
8038                  nullptr, "vec.epilog.iter.check", true);
8039   emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader,
8040                                           VecEpilogueIterationCountCheck);
8041   AdditionalBypassBlock = VecEpilogueIterationCountCheck;
8042 
8043   // Adjust the control flow taking the state info from the main loop
8044   // vectorization into account.
8045   assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
8046          "expected this to be saved from the previous pass.");
8047   EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
8048       VecEpilogueIterationCountCheck, LoopVectorPreHeader);
8049 
8050   EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
8051       VecEpilogueIterationCountCheck, LoopScalarPreHeader);
8052 
8053   if (EPI.SCEVSafetyCheck)
8054     EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
8055         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
8056   if (EPI.MemSafetyCheck)
8057     EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
8058         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
8059 
8060   DT->changeImmediateDominator(LoopScalarPreHeader,
8061                                EPI.EpilogueIterationCountCheck);
8062   // Keep track of bypass blocks, as they feed start values to the induction and
8063   // reduction phis in the scalar loop preheader.
8064   if (EPI.SCEVSafetyCheck)
8065     LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
8066   if (EPI.MemSafetyCheck)
8067     LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
8068   LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
8069 
8070   // The vec.epilog.iter.check block may contain Phi nodes from inductions or
8071   // reductions which merge control-flow from the latch block and the middle
8072   // block. Update the incoming values here and move the Phi into the preheader.
8073   SmallVector<PHINode *, 4> PhisInBlock;
8074   for (PHINode &Phi : VecEpilogueIterationCountCheck->phis())
8075     PhisInBlock.push_back(&Phi);
8076 
8077   for (PHINode *Phi : PhisInBlock) {
8078     Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI());
8079     Phi->replaceIncomingBlockWith(
8080         VecEpilogueIterationCountCheck->getSinglePredecessor(),
8081         VecEpilogueIterationCountCheck);
8082 
8083     // If the phi doesn't have an incoming value from the
8084     // EpilogueIterationCountCheck, we are done. Otherwise remove the incoming
8085     // value and also those from other check blocks. This is needed for
8086     // reduction phis only.
8087     if (none_of(Phi->blocks(), [&](BasicBlock *IncB) {
8088           return EPI.EpilogueIterationCountCheck == IncB;
8089         }))
8090       continue;
8091     Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
8092     if (EPI.SCEVSafetyCheck)
8093       Phi->removeIncomingValue(EPI.SCEVSafetyCheck);
8094     if (EPI.MemSafetyCheck)
8095       Phi->removeIncomingValue(EPI.MemSafetyCheck);
8096   }
8097 
8098   // Generate bypass values from the additional bypass block. Note that when the
8099   // vectorized epilogue is skipped due to the iteration count check, the
8100   // resume value for the induction variable comes from the trip count of the
8101   // main vector loop, passed as the second argument.
8102   createInductionAdditionalBypassValues(ExpandedSCEVs, EPI.VectorTripCount);
8103   return LoopVectorPreHeader;
8104 }
8105 
8106 BasicBlock *
8107 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
8108     BasicBlock *Bypass, BasicBlock *Insert) {
8109 
8110   assert(EPI.TripCount &&
8111          "Expected trip count to have been saved in the first pass.");
8112   assert(
8113       (!isa<Instruction>(EPI.TripCount) ||
8114        DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
8115       "saved trip count does not dominate insertion point.");
8116   Value *TC = EPI.TripCount;
8117   IRBuilder<> Builder(Insert->getTerminator());
8118   Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
8119 
8120   // Generate code to check if the loop's trip count is less than VF * UF of the
8121   // vector epilogue loop.
8122   auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector())
8123                ? ICmpInst::ICMP_ULE
8124                : ICmpInst::ICMP_ULT;
8125 
8126   Value *CheckMinIters =
8127       Builder.CreateICmp(P, Count,
8128                          createStepForVF(Builder, Count->getType(),
8129                                          EPI.EpilogueVF, EPI.EpilogueUF),
8130                          "min.epilog.iters.check");
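  // Worked example (hypothetical values, assuming the `ult` predicate): with
  // an original trip count of 100 and a main-loop step of 16, the main vector
  // loop covers 96 iterations and n.vec.remaining is 4; with an epilogue step
  // of 4 the check above is `4 < 4` (false), so the vector epilogue runs. Had
  // only 3 iterations remained, the check would bypass to the scalar loop.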
8131 
8132   BranchInst &BI =
8133       *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
8134   if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
8135     unsigned MainLoopStep = UF * VF.getKnownMinValue();
8136     unsigned EpilogueLoopStep =
8137         EPI.EpilogueUF * EPI.EpilogueVF.getKnownMinValue();
8138     // We assume the remaining `Count` is equally distributed in
8139     // [0, MainLoopStep)
8140     // So the probability for `Count < EpilogueLoopStep` should be
8141     // min(MainLoopStep, EpilogueLoopStep) / MainLoopStep
8142     unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep);
8143     const uint32_t Weights[] = {EstimatedSkipCount,
8144                                 MainLoopStep - EstimatedSkipCount};
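    // Worked example (hypothetical factors): main loop VF = 8, UF = 2 gives
    // MainLoopStep = 16; epilogue VF = 4, UF = 1 gives EpilogueLoopStep = 4.
    // Then EstimatedSkipCount = 4 and the weights are {4, 12}, i.e. the vector
    // epilogue is expected to be skipped in about 4 of 16 cases.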
8145     setBranchWeights(BI, Weights, /*IsExpected=*/false);
8146   }
8147   ReplaceInstWithInst(Insert->getTerminator(), &BI);
8148   LoopBypassBlocks.push_back(Insert);
8149 
8150   // A new entry block has been created for the epilogue VPlan. Hook it in, as
8151   // otherwise we would try to modify the entry to the main vector loop.
8152   VPIRBasicBlock *NewEntry = Plan.createVPIRBasicBlock(Insert);
8153   VPBasicBlock *OldEntry = Plan.getEntry();
8154   VPBlockUtils::reassociateBlocks(OldEntry, NewEntry);
8155   Plan.setEntry(NewEntry);
8156   // OldEntry is now dead and will be cleaned up when the plan gets destroyed.
8157 
8158   introduceCheckBlockInVPlan(Insert);
8159   return Insert;
8160 }
8161 
8162 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
8163   LLVM_DEBUG({
8164     dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
8165            << "Epilogue Loop VF:" << EPI.EpilogueVF
8166            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
8167   });
8168 }
8169 
8170 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
8171   DEBUG_WITH_TYPE(VerboseDebug, {
8172     dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
8173   });
8174 }
8175 
8176 iterator_range<mapped_iterator<Use *, std::function<VPValue *(Value *)>>>
8177 VPRecipeBuilder::mapToVPValues(User::op_range Operands) {
8178   std::function<VPValue *(Value *)> Fn = [this](Value *Op) {
8179     return getVPValueOrAddLiveIn(Op);
8180   };
8181   return map_range(Operands, Fn);
8182 }
8183 
8184 void VPRecipeBuilder::createSwitchEdgeMasks(SwitchInst *SI) {
8185   BasicBlock *Src = SI->getParent();
8186   assert(!OrigLoop->isLoopExiting(Src) &&
8187          all_of(successors(Src),
8188                 [this](BasicBlock *Succ) {
8189                   return OrigLoop->getHeader() != Succ;
8190                 }) &&
8191          "unsupported switch either exiting loop or continuing to header");
8192   // Create masks where the terminator in Src is a switch. We create masks for
8193   // all edges at the same time. This is more efficient, as we can create and
8194   // collect compares for all cases at once.
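  // Illustrative example (hypothetical blocks and values): for a switch on %c
  // with cases 1 -> %bb1, 2 -> %bb1, 3 -> %bb2 and default destination %dflt,
  // and with SrcMask the mask of the switch's block, the masks built below are:
  //   EdgeMask(Src, bb1)  = SrcMask && ((%c == 1) || (%c == 2))
  //   EdgeMask(Src, bb2)  = SrcMask && (%c == 3)
  //   EdgeMask(Src, dflt) = SrcMask && !(EdgeMask(Src, bb1) || EdgeMask(Src, bb2))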
8195   VPValue *Cond = getVPValueOrAddLiveIn(SI->getCondition());
8196   BasicBlock *DefaultDst = SI->getDefaultDest();
8197   MapVector<BasicBlock *, SmallVector<VPValue *>> Dst2Compares;
8198   for (auto &C : SI->cases()) {
8199     BasicBlock *Dst = C.getCaseSuccessor();
8200     assert(!EdgeMaskCache.contains({Src, Dst}) && "Edge masks already created");
8201     // Cases whose destination is the same as default are redundant and can be
8202     // ignored - they will get there anyhow.
8203     if (Dst == DefaultDst)
8204       continue;
8205     auto &Compares = Dst2Compares[Dst];
8206     VPValue *V = getVPValueOrAddLiveIn(C.getCaseValue());
8207     Compares.push_back(Builder.createICmp(CmpInst::ICMP_EQ, Cond, V));
8208   }
8209 
8210   // We need to handle 2 separate cases below for all entries in Dst2Compares,
8211   // which excludes destinations matching the default destination.
8212   VPValue *SrcMask = getBlockInMask(Src);
8213   VPValue *DefaultMask = nullptr;
8214   for (const auto &[Dst, Conds] : Dst2Compares) {
8215     // 1. Dst is not the default destination. Dst is reached if any of the cases
8216     // with destination == Dst are taken. Join the conditions for each case
8217     // whose destination == Dst using an OR.
8218     VPValue *Mask = Conds[0];
8219     for (VPValue *V : ArrayRef<VPValue *>(Conds).drop_front())
8220       Mask = Builder.createOr(Mask, V);
8221     if (SrcMask)
8222       Mask = Builder.createLogicalAnd(SrcMask, Mask);
8223     EdgeMaskCache[{Src, Dst}] = Mask;
8224 
8225     // 2. Create the mask for the default destination, which is reached if none
8226     // of the cases with destination != default destination are taken. Join the
8227     // conditions for each case where the destination is != Dst using an OR and
8228     // negate it.
8229     DefaultMask = DefaultMask ? Builder.createOr(DefaultMask, Mask) : Mask;
8230   }
8231 
8232   if (DefaultMask) {
8233     DefaultMask = Builder.createNot(DefaultMask);
8234     if (SrcMask)
8235       DefaultMask = Builder.createLogicalAnd(SrcMask, DefaultMask);
8236   }
8237   EdgeMaskCache[{Src, DefaultDst}] = DefaultMask;
8238 }
8239 
8240 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
8241   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
8242 
8243   // Look for cached value.
8244   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
8245   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
8246   if (ECEntryIt != EdgeMaskCache.end())
8247     return ECEntryIt->second;
8248 
8249   if (auto *SI = dyn_cast<SwitchInst>(Src->getTerminator())) {
8250     createSwitchEdgeMasks(SI);
8251     assert(EdgeMaskCache.contains(Edge) && "Mask for Edge not created?");
8252     return EdgeMaskCache[Edge];
8253   }
8254 
8255   VPValue *SrcMask = getBlockInMask(Src);
8256 
8257   // The terminator has to be a branch inst!
8258   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
8259   assert(BI && "Unexpected terminator found");
8260   if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
8261     return EdgeMaskCache[Edge] = SrcMask;
8262 
8263   // If source is an exiting block, we know the exit edge is dynamically dead
8264   // in the vector loop, and thus we don't need to restrict the mask.  Avoid
8265   // adding uses of an otherwise potentially dead instruction unless we are
8266   // vectorizing a loop with uncountable exits. In that case, we always
8267   // materialize the mask.
8268   if (OrigLoop->isLoopExiting(Src) &&
8269       Src != Legal->getUncountableEarlyExitingBlock())
8270     return EdgeMaskCache[Edge] = SrcMask;
8271 
8272   VPValue *EdgeMask = getVPValueOrAddLiveIn(BI->getCondition());
8273   assert(EdgeMask && "No Edge Mask found for condition");
8274 
8275   if (BI->getSuccessor(0) != Dst)
8276     EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc());
8277 
8278   if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
8279     // The bitwise 'And' of SrcMask and EdgeMask introduces new UB if SrcMask
8280     // is false and EdgeMask is poison. Avoid that by using 'LogicalAnd'
8281     // instead which generates 'select i1 SrcMask, i1 EdgeMask, i1 false'.
8282     EdgeMask = Builder.createLogicalAnd(SrcMask, EdgeMask, BI->getDebugLoc());
8283   }
8284 
8285   return EdgeMaskCache[Edge] = EdgeMask;
8286 }
8287 
8288 VPValue *VPRecipeBuilder::getEdgeMask(BasicBlock *Src, BasicBlock *Dst) const {
8289   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
8290 
8291   // Look for cached value.
8292   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
8293   EdgeMaskCacheTy::const_iterator ECEntryIt = EdgeMaskCache.find(Edge);
8294   assert(ECEntryIt != EdgeMaskCache.end() &&
8295          "looking up mask for edge which has not been created");
8296   return ECEntryIt->second;
8297 }
8298 
8299 void VPRecipeBuilder::createHeaderMask() {
8300   BasicBlock *Header = OrigLoop->getHeader();
8301 
8302   // When not folding the tail, use nullptr to model all-true mask.
8303   if (!CM.foldTailByMasking()) {
8304     BlockMaskCache[Header] = nullptr;
8305     return;
8306   }
8307 
8308   // Introduce the early-exit compare IV <= BTC to form header block mask.
8309   // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
8310   // constructing the desired canonical IV in the header block as its first
8311   // non-phi instructions.
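  // For illustration (hypothetical trip count): with TC = 7 the BTC is 6; for
  // VF = 4 the second vector iteration has IV lanes {4, 5, 6, 7}, so the
  // ICMP_ULE header mask is {1, 1, 1, 0} and lane 7 is masked off. If TC itself
  // were used and wrapped to 0, IV < TC would be false for every lane.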
8312 
8313   VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
8314   auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
8315   auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV());
8316   HeaderVPBB->insert(IV, NewInsertionPoint);
8317 
8318   VPBuilder::InsertPointGuard Guard(Builder);
8319   Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
8320   VPValue *BlockMask = nullptr;
8321   VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
8322   BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC);
8323   BlockMaskCache[Header] = BlockMask;
8324 }
8325 
8326 VPValue *VPRecipeBuilder::getBlockInMask(BasicBlock *BB) const {
8327   // Return the cached value.
8328   BlockMaskCacheTy::const_iterator BCEntryIt = BlockMaskCache.find(BB);
8329   assert(BCEntryIt != BlockMaskCache.end() &&
8330          "Trying to access mask for block without one.");
8331   return BCEntryIt->second;
8332 }
8333 
8334 void VPRecipeBuilder::createBlockInMask(BasicBlock *BB) {
8335   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
8336   assert(BlockMaskCache.count(BB) == 0 && "Mask for block already computed");
8337   assert(OrigLoop->getHeader() != BB &&
8338          "Loop header must have cached block mask");
8339 
8340   // All-one mask is modelled as no-mask following the convention for masked
8341   // load/store/gather/scatter. Initialize BlockMask to no-mask.
8342   VPValue *BlockMask = nullptr;
8343   // This is the block mask. We OR all unique incoming edges.
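  // For example (hypothetical blocks): if BB has predecessors P1 and P2, the
  // result is EdgeMask(P1, BB) || EdgeMask(P2, BB); if any edge mask is all-one
  // (nullptr), the block mask is all-one as well.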
8344   for (auto *Predecessor :
8345        SetVector<BasicBlock *>(pred_begin(BB), pred_end(BB))) {
8346     VPValue *EdgeMask = createEdgeMask(Predecessor, BB);
8347     if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is too.
8348       BlockMaskCache[BB] = EdgeMask;
8349       return;
8350     }
8351 
8352     if (!BlockMask) { // BlockMask has its initialized nullptr value.
8353       BlockMask = EdgeMask;
8354       continue;
8355     }
8356 
8357     BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
8358   }
8359 
8360   BlockMaskCache[BB] = BlockMask;
8361 }
8362 
8363 VPWidenMemoryRecipe *
8364 VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
8365                                   VFRange &Range) {
8366   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
8367          "Must be called with either a load or store");
8368 
8369   auto WillWiden = [&](ElementCount VF) -> bool {
8370     LoopVectorizationCostModel::InstWidening Decision =
8371         CM.getWideningDecision(I, VF);
8372     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
8373            "CM decision should be taken at this point.");
8374     if (Decision == LoopVectorizationCostModel::CM_Interleave)
8375       return true;
8376     if (CM.isScalarAfterVectorization(I, VF) ||
8377         CM.isProfitableToScalarize(I, VF))
8378       return false;
8379     return Decision != LoopVectorizationCostModel::CM_Scalarize;
8380   };
8381 
8382   if (!LoopVectorizationPlanner::getDecisionAndClampRange(WillWiden, Range))
8383     return nullptr;
8384 
8385   VPValue *Mask = nullptr;
8386   if (Legal->isMaskRequired(I))
8387     Mask = getBlockInMask(I->getParent());
8388 
8389   // Determine if the pointer operand of the access is either consecutive or
8390   // reverse consecutive.
8391   LoopVectorizationCostModel::InstWidening Decision =
8392       CM.getWideningDecision(I, Range.Start);
8393   bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
8394   bool Consecutive =
8395       Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
8396 
8397   VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
8398   if (Consecutive) {
8399     auto *GEP = dyn_cast<GetElementPtrInst>(
8400         Ptr->getUnderlyingValue()->stripPointerCasts());
8401     VPSingleDefRecipe *VectorPtr;
8402     if (Reverse) {
8403       // When folding the tail, we may compute an address that we don't compute
8404       // in the original scalar loop, and it may not be inbounds. Drop Inbounds
8405       // in that case.
8406       GEPNoWrapFlags Flags =
8407           (CM.foldTailByMasking() || !GEP || !GEP->isInBounds())
8408               ? GEPNoWrapFlags::none()
8409               : GEPNoWrapFlags::inBounds();
8410       VectorPtr = new VPReverseVectorPointerRecipe(
8411           Ptr, &Plan.getVF(), getLoadStoreType(I), Flags, I->getDebugLoc());
8412     } else {
8413       VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I),
8414                                             GEP ? GEP->getNoWrapFlags()
8415                                                 : GEPNoWrapFlags::none(),
8416                                             I->getDebugLoc());
8417     }
8418     Builder.getInsertBlock()->appendRecipe(VectorPtr);
8419     Ptr = VectorPtr;
8420   }
8421   if (LoadInst *Load = dyn_cast<LoadInst>(I))
8422     return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
8423                                  I->getDebugLoc());
8424 
8425   StoreInst *Store = cast<StoreInst>(I);
8426   return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive,
8427                                 Reverse, I->getDebugLoc());
8428 }
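
// Illustrative sketch (assumed example, not from the original source): a loop
// whose accesses are reverse-consecutive and therefore take the
// VPReverseVectorPointerRecipe path above, assuming 'dst' and 'src' do not
// alias:
//
//   void reverse_copy(int *dst, const int *src, int n) {
//     for (int i = n - 1; i >= 0; --i)
//       dst[i] = src[i];   // stride -1 accesses -> CM_Widen_Reverse
//   }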
8429 
8430 /// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will also
8431 /// insert a recipe to expand the step for the induction recipe.
8432 static VPWidenIntOrFpInductionRecipe *
8433 createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc,
8434                             VPValue *Start, const InductionDescriptor &IndDesc,
8435                             VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop) {
8436   assert(IndDesc.getStartValue() ==
8437          Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
8438   assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) &&
8439          "step must be loop invariant");
8440 
8441   VPValue *Step =
8442       vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE);
8443   if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) {
8444     return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(),
8445                                              IndDesc, TruncI,
8446                                              TruncI->getDebugLoc());
8447   }
8448   assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
8449   return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(),
8450                                            IndDesc, Phi->getDebugLoc());
8451 }
8452 
8453 VPHeaderPHIRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI(
8454     PHINode *Phi, ArrayRef<VPValue *> Operands, VFRange &Range) {
8455 
8456   // Check if this is an integer or fp induction. If so, build the recipe that
8457   // produces its scalar and vector values.
8458   if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
8459     return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, Plan,
8460                                        *PSE.getSE(), *OrigLoop);
8461 
8462   // Check if this is pointer induction. If so, build the recipe for it.
8463   if (auto *II = Legal->getPointerInductionDescriptor(Phi)) {
8464     VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep(),
8465                                                            *PSE.getSE());
8466     return new VPWidenPointerInductionRecipe(
8467         Phi, Operands[0], Step, *II,
8468         LoopVectorizationPlanner::getDecisionAndClampRange(
8469             [&](ElementCount VF) {
8470               return CM.isScalarAfterVectorization(Phi, VF);
8471             },
8472             Range),
8473         Phi->getDebugLoc());
8474   }
8475   return nullptr;
8476 }
8477 
8478 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
8479     TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range) {
8480   // Optimize the special case where the source is a constant integer
8481   // induction variable. Notice that we can only optimize the 'trunc' case
8482   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
8483   // (c) other casts depend on pointer size.
8484 
8485   // Determine whether \p K is a truncation based on an induction variable that
8486   // can be optimized.
8487   auto IsOptimizableIVTruncate =
8488       [&](Instruction *K) -> std::function<bool(ElementCount)> {
8489     return [=](ElementCount VF) -> bool {
8490       return CM.isOptimizableIVTruncate(K, VF);
8491     };
8492   };
8493 
8494   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8495           IsOptimizableIVTruncate(I), Range)) {
8496 
8497     auto *Phi = cast<PHINode>(I->getOperand(0));
8498     const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi);
8499     VPValue *Start = Plan.getOrAddLiveIn(II.getStartValue());
8500     return createWidenInductionRecipes(Phi, I, Start, II, Plan, *PSE.getSE(),
8501                                        *OrigLoop);
8502   }
8503   return nullptr;
8504 }
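
// Illustrative sketch (assumed example): a truncation of the primary induction
// that this hook can fold into a narrower widened induction:
//
//   void store_iota(short *a, long n) {
//     for (long i = 0; i < n; ++i)
//       a[i] = (short)i;   // trunc of the i64 IV -> widened i16 induction
//   }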
8505 
8506 VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi,
8507                                            ArrayRef<VPValue *> Operands) {
8508   unsigned NumIncoming = Phi->getNumIncomingValues();
8509 
8510   // We know that all PHIs in non-header blocks are converted into selects, so
8511   // we don't have to worry about the insertion order and we can just use the
8512   // builder. At this point we generate the predication tree. There may be
8513   // duplications since this is a simple recursive scan, but future
8514   // optimizations will clean it up.
8515   SmallVector<VPValue *, 2> OperandsWithMask;
8516 
8517   for (unsigned In = 0; In < NumIncoming; In++) {
8518     OperandsWithMask.push_back(Operands[In]);
8519     VPValue *EdgeMask =
8520         getEdgeMask(Phi->getIncomingBlock(In), Phi->getParent());
8521     if (!EdgeMask) {
8522       assert(In == 0 && "Both null and non-null edge masks found");
8523       assert(all_equal(Operands) &&
8524              "Distinct incoming values with one having a full mask");
8525       break;
8526     }
8527     OperandsWithMask.push_back(EdgeMask);
8528   }
8529   return new VPBlendRecipe(Phi, OperandsWithMask);
8530 }
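
// Illustrative sketch (assumed example): a phi in a non-header block that is
// turned into a VPBlendRecipe; its incoming values are paired with the
// corresponding edge masks computed earlier:
//
//   for (int i = 0; i < n; ++i) {
//     int t;
//     if (c[i]) t = a[i]; else t = b[i];  // 't' is a phi of both branches
//     out[i] = t;                         // blend(a[i], mask_then, b[i], mask_else)
//   }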
8531 
8532 VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
8533                                                    ArrayRef<VPValue *> Operands,
8534                                                    VFRange &Range) {
8535   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8536       [this, CI](ElementCount VF) {
8537         return CM.isScalarWithPredication(CI, VF);
8538       },
8539       Range);
8540 
8541   if (IsPredicated)
8542     return nullptr;
8543 
8544   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8545   if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8546              ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8547              ID == Intrinsic::pseudoprobe ||
8548              ID == Intrinsic::experimental_noalias_scope_decl))
8549     return nullptr;
8550 
8551   SmallVector<VPValue *, 4> Ops(Operands.take_front(CI->arg_size()));
8552 
8553   // Is it beneficial to perform an intrinsic call compared to a lib call?
8554   bool ShouldUseVectorIntrinsic =
8555       ID && LoopVectorizationPlanner::getDecisionAndClampRange(
8556                 [&](ElementCount VF) -> bool {
8557                   return CM.getCallWideningDecision(CI, VF).Kind ==
8558                          LoopVectorizationCostModel::CM_IntrinsicCall;
8559                 },
8560                 Range);
8561   if (ShouldUseVectorIntrinsic)
8562     return new VPWidenIntrinsicRecipe(*CI, ID, Ops, CI->getType(),
8563                                       CI->getDebugLoc());
8564 
8565   Function *Variant = nullptr;
8566   std::optional<unsigned> MaskPos;
8567   // Is it better to call a vectorized version of the function than to
8568   // scalarize the call?
8569   auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange(
8570       [&](ElementCount VF) -> bool {
8571         // The following case may be scalarized depending on the VF.
8572         // The flag shows whether we can use a usual Call for the vectorized
8573         // version of the instruction.
8574 
8575         // If we've found a variant at a previous VF, then stop looking. A
8576         // vectorized variant of a function expects input in a certain shape
8577         // -- basically the number of input registers, the number of lanes
8578         // per register, and whether there's a mask required.
8579         // We store a pointer to the variant in the VPWidenCallRecipe, so
8580         // once we have an appropriate variant it's only valid for that VF.
8581         // This will force a different vplan to be generated for each VF that
8582         // finds a valid variant.
8583         if (Variant)
8584           return false;
8585         LoopVectorizationCostModel::CallWideningDecision Decision =
8586             CM.getCallWideningDecision(CI, VF);
8587         if (Decision.Kind == LoopVectorizationCostModel::CM_VectorCall) {
8588           Variant = Decision.Variant;
8589           MaskPos = Decision.MaskPos;
8590           return true;
8591         }
8592 
8593         return false;
8594       },
8595       Range);
8596   if (ShouldUseVectorCall) {
8597     if (MaskPos.has_value()) {
8598       // We have 2 cases that would require a mask:
8599       //   1) The block needs to be predicated, either due to a conditional
8600       //      in the scalar loop or use of an active lane mask with
8601       //      tail-folding, and we use the appropriate mask for the block.
8602       //   2) No mask is required for the block, but the only available
8603       //      vector variant at this VF requires a mask, so we synthesize an
8604       //      all-true mask.
8605       VPValue *Mask = nullptr;
8606       if (Legal->isMaskRequired(CI))
8607         Mask = getBlockInMask(CI->getParent());
8608       else
8609         Mask = Plan.getOrAddLiveIn(
8610             ConstantInt::getTrue(IntegerType::getInt1Ty(CI->getContext())));
8611 
8612       Ops.insert(Ops.begin() + *MaskPos, Mask);
8613     }
8614 
8615     Ops.push_back(Operands.back());
8616     return new VPWidenCallRecipe(CI, Variant, Ops, CI->getDebugLoc());
8617   }
8618 
8619   return nullptr;
8620 }
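
// Illustrative sketch (assumed example): a call that may be widened either to
// a vector intrinsic or to a vector library variant found via the TLI; if the
// only variant available at this VF is masked, an all-true mask is synthesized
// above:
//
//   #include <math.h>
//   void apply_sin(double *out, const double *in, int n) {
//     for (int i = 0; i < n; ++i)
//       out[i] = sin(in[i]);   // may become llvm.sin.* or a vector libm call
//   }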
8621 
8622 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
8623   assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8624          !isa<StoreInst>(I) && "Instruction should have been handled earlier");
8625   // Instruction should be widened, unless it is scalar after vectorization,
8626   // scalarization is profitable or it is predicated.
8627   auto WillScalarize = [this, I](ElementCount VF) -> bool {
8628     return CM.isScalarAfterVectorization(I, VF) ||
8629            CM.isProfitableToScalarize(I, VF) ||
8630            CM.isScalarWithPredication(I, VF);
8631   };
8632   return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
8633                                                              Range);
8634 }
8635 
8636 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
8637                                            ArrayRef<VPValue *> Operands,
8638                                            VPBasicBlock *VPBB) {
8639   switch (I->getOpcode()) {
8640   default:
8641     return nullptr;
8642   case Instruction::SDiv:
8643   case Instruction::UDiv:
8644   case Instruction::SRem:
8645   case Instruction::URem: {
8646     // If not provably safe, use a select to form a safe divisor before widening the
8647     // div/rem operation itself.  Otherwise fall through to general handling below.
8648     if (CM.isPredicatedInst(I)) {
8649       SmallVector<VPValue *> Ops(Operands);
8650       VPValue *Mask = getBlockInMask(I->getParent());
8651       VPValue *One =
8652           Plan.getOrAddLiveIn(ConstantInt::get(I->getType(), 1u, false));
8653       auto *SafeRHS = Builder.createSelect(Mask, Ops[1], One, I->getDebugLoc());
8654       Ops[1] = SafeRHS;
8655       return new VPWidenRecipe(*I, make_range(Ops.begin(), Ops.end()));
8656     }
8657     [[fallthrough]];
8658   }
8659   case Instruction::Add:
8660   case Instruction::And:
8661   case Instruction::AShr:
8662   case Instruction::FAdd:
8663   case Instruction::FCmp:
8664   case Instruction::FDiv:
8665   case Instruction::FMul:
8666   case Instruction::FNeg:
8667   case Instruction::FRem:
8668   case Instruction::FSub:
8669   case Instruction::ICmp:
8670   case Instruction::LShr:
8671   case Instruction::Mul:
8672   case Instruction::Or:
8673   case Instruction::Select:
8674   case Instruction::Shl:
8675   case Instruction::Sub:
8676   case Instruction::Xor:
8677   case Instruction::Freeze:
8678     SmallVector<VPValue *> NewOps(Operands);
8679     if (Instruction::isBinaryOp(I->getOpcode())) {
8680       // The legacy cost model uses SCEV to check if some of the operands are
8681       // constants. To match the legacy cost model's behavior, use SCEV to try
8682       // to replace operands with constants.
8683       ScalarEvolution &SE = *PSE.getSE();
8684       auto GetConstantViaSCEV = [this, &SE](VPValue *Op) {
8685         Value *V = Op->getUnderlyingValue();
8686         if (isa<Constant>(V) || !SE.isSCEVable(V->getType()))
8687           return Op;
8688         auto *C = dyn_cast<SCEVConstant>(SE.getSCEV(V));
8689         if (!C)
8690           return Op;
8691         return Plan.getOrAddLiveIn(C->getValue());
8692       };
8693       // For Mul, the legacy cost model checks both operands.
8694       if (I->getOpcode() == Instruction::Mul)
8695         NewOps[0] = GetConstantViaSCEV(NewOps[0]);
8696       // For other binops, the legacy cost model only checks the second operand.
8697       NewOps[1] = GetConstantViaSCEV(NewOps[1]);
8698     }
8699     return new VPWidenRecipe(*I, make_range(NewOps.begin(), NewOps.end()));
8700   };
8701 }
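
// Illustrative sketch (assumed example): a conditional division whose divisor
// is made safe with the select built in the SDiv/UDiv/SRem/URem case above:
//
//   for (int i = 0; i < n; ++i)
//     if (d[i] != 0)
//       q[i] = a[i] / d[i];   // widened as a[i] / select(mask, d[i], 1)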
8702 
8703 VPHistogramRecipe *
8704 VPRecipeBuilder::tryToWidenHistogram(const HistogramInfo *HI,
8705                                      ArrayRef<VPValue *> Operands) {
8706   // FIXME: Support other operations.
8707   unsigned Opcode = HI->Update->getOpcode();
8708   assert((Opcode == Instruction::Add || Opcode == Instruction::Sub) &&
8709          "Histogram update operation must be an Add or Sub");
8710 
8711   SmallVector<VPValue *, 3> HGramOps;
8712   // Bucket address.
8713   HGramOps.push_back(Operands[1]);
8714   // Increment value.
8715   HGramOps.push_back(getVPValueOrAddLiveIn(HI->Update->getOperand(1)));
8716 
8717   // In case of predicated execution (due to tail-folding, or conditional
8718   // execution, or both), pass the relevant mask.
8719   if (Legal->isMaskRequired(HI->Store))
8720     HGramOps.push_back(getBlockInMask(HI->Store->getParent()));
8721 
8722   return new VPHistogramRecipe(Opcode,
8723                                make_range(HGramOps.begin(), HGramOps.end()),
8724                                HI->Store->getDebugLoc());
8725 }
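
// Illustrative sketch (assumed example): the update pattern recognized as a
// histogram, where the bucket address depends on loop-varying data:
//
//   for (int i = 0; i < n; ++i)
//     hist[idx[i]] += 1;   // load, add, store to a data-dependent bucket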
8726 
8727 void VPRecipeBuilder::fixHeaderPhis() {
8728   BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
8729   for (VPHeaderPHIRecipe *R : PhisToFix) {
8730     auto *PN = cast<PHINode>(R->getUnderlyingValue());
8731     VPRecipeBase *IncR =
8732         getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
8733     R->addOperand(IncR->getVPSingleValue());
8734   }
8735 }
8736 
8737 VPReplicateRecipe *VPRecipeBuilder::handleReplication(Instruction *I,
8738                                                       VFRange &Range) {
8739   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
8740       [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8741       Range);
8742 
8743   bool IsPredicated = CM.isPredicatedInst(I);
8744 
8745   // Even if the instruction is not marked as uniform, there are certain
8746   // intrinsic calls that can be effectively treated as such, so we check for
8747   // them here. Conservatively, we only do this for scalable vectors, since
8748   // for fixed-width VFs we can always fall back on full scalarization.
8749   if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
8750     switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
8751     case Intrinsic::assume:
8752     case Intrinsic::lifetime_start:
8753     case Intrinsic::lifetime_end:
8754       // For scalable vectors if one of the operands is variant then we still
8755       // want to mark as uniform, which will generate one instruction for just
8756       // the first lane of the vector. We can't scalarize the call in the same
8757       // way as for fixed-width vectors because we don't know how many lanes
8758       // there are.
8759       //
8760       // The reasons for doing it this way for scalable vectors are:
8761       //   1. For the assume intrinsic, generating the instruction for the first
8762       //      lane is still better than not generating any at all. For
8763       //      example, the input may be a splat across all lanes.
8764       //   2. For the lifetime start/end intrinsics the pointer operand only
8765       //      does anything useful when the input comes from a stack object,
8766       //      which suggests it should always be uniform. For non-stack objects
8767       //      the effect is to poison the object, which still allows us to
8768       //      remove the call.
8769       IsUniform = true;
8770       break;
8771     default:
8772       break;
8773     }
8774   }
8775   VPValue *BlockInMask = nullptr;
8776   if (!IsPredicated) {
8777     // Finalize the recipe for Instr, handling the non-predicated case first.
8778     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8779   } else {
8780     LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8781     // Instructions marked for predication are replicated and a mask operand is
8782     // added initially. Masked replicate recipes will later be placed under an
8783     // if-then construct to prevent side-effects. Generate recipes to compute
8784     // the block mask for this region.
8785     BlockInMask = getBlockInMask(I->getParent());
8786   }
8787 
8788   // Note that the custom logic above may manually mark certain intrinsics
8789   // as uniform for scalable vectors, which this assert needs to account for
8790   // as well.
8791   assert((Range.Start.isScalar() || !IsUniform || !IsPredicated ||
8792           (Range.Start.isScalable() && isa<IntrinsicInst>(I))) &&
8793          "Should not predicate a uniform recipe");
8794   auto *Recipe = new VPReplicateRecipe(I, mapToVPValues(I->operands()),
8795                                        IsUniform, BlockInMask);
8796   return Recipe;
8797 }
8798 
8799 /// Find all possible partial reductions in the loop and track all of those that
8800 /// are valid so recipes can be formed later.
8801 void VPRecipeBuilder::collectScaledReductions(VFRange &Range) {
8802   // Find all possible partial reductions.
8803   SmallVector<std::pair<PartialReductionChain, unsigned>, 1>
8804       PartialReductionChains;
8805   for (const auto &[Phi, RdxDesc] : Legal->getReductionVars())
8806     if (std::optional<std::pair<PartialReductionChain, unsigned>> Pair =
8807             getScaledReduction(Phi, RdxDesc, Range))
8808       PartialReductionChains.push_back(*Pair);
8809 
8810   // A partial reduction is invalid if any of its extends are used by
8811   // something that isn't another partial reduction. This is because the
8812   // extends are intended to be lowered along with the reduction itself.
8813 
8814   // Build up a set of partial reduction bin ops for efficient use checking.
8815   SmallSet<User *, 4> PartialReductionBinOps;
8816   for (const auto &[PartialRdx, _] : PartialReductionChains)
8817     PartialReductionBinOps.insert(PartialRdx.BinOp);
8818 
8819   auto ExtendIsOnlyUsedByPartialReductions =
8820       [&PartialReductionBinOps](Instruction *Extend) {
8821         return all_of(Extend->users(), [&](const User *U) {
8822           return PartialReductionBinOps.contains(U);
8823         });
8824       };
8825 
8826   // Check if each use of a chain's two extends is a partial reduction
8827   // and only add those that don't have non-partial reduction users.
8828   for (auto Pair : PartialReductionChains) {
8829     PartialReductionChain Chain = Pair.first;
8830     if (ExtendIsOnlyUsedByPartialReductions(Chain.ExtendA) &&
8831         ExtendIsOnlyUsedByPartialReductions(Chain.ExtendB))
8832       ScaledReductionExitInstrs.insert(std::make_pair(Chain.Reduction, Pair));
8833   }
8834 }
8835 
8836 std::optional<std::pair<PartialReductionChain, unsigned>>
8837 VPRecipeBuilder::getScaledReduction(PHINode *PHI,
8838                                     const RecurrenceDescriptor &Rdx,
8839                                     VFRange &Range) {
8840   // TODO: Allow scaling reductions when predicating. The select at
8841   // the end of the loop chooses between the phi value and most recent
8842   // reduction result, both of which have different VFs to the active lane
8843   // mask when scaling.
8844   if (CM.blockNeedsPredicationForAnyReason(Rdx.getLoopExitInstr()->getParent()))
8845     return std::nullopt;
8846 
8847   auto *Update = dyn_cast<BinaryOperator>(Rdx.getLoopExitInstr());
8848   if (!Update)
8849     return std::nullopt;
8850 
8851   Value *Op = Update->getOperand(0);
8852   Value *PhiOp = Update->getOperand(1);
8853   if (Op == PHI) {
8854     Op = Update->getOperand(1);
8855     PhiOp = Update->getOperand(0);
8856   }
8857   if (PhiOp != PHI)
8858     return std::nullopt;
8859 
8860   auto *BinOp = dyn_cast<BinaryOperator>(Op);
8861   if (!BinOp || !BinOp->hasOneUse())
8862     return std::nullopt;
8863 
8864   using namespace llvm::PatternMatch;
8865   Value *A, *B;
8866   if (!match(BinOp->getOperand(0), m_ZExtOrSExt(m_Value(A))) ||
8867       !match(BinOp->getOperand(1), m_ZExtOrSExt(m_Value(B))))
8868     return std::nullopt;
8869 
8870   Instruction *ExtA = cast<Instruction>(BinOp->getOperand(0));
8871   Instruction *ExtB = cast<Instruction>(BinOp->getOperand(1));
8872 
8873   TTI::PartialReductionExtendKind OpAExtend =
8874       TargetTransformInfo::getPartialReductionExtendKind(ExtA);
8875   TTI::PartialReductionExtendKind OpBExtend =
8876       TargetTransformInfo::getPartialReductionExtendKind(ExtB);
8877 
8878   PartialReductionChain Chain(Rdx.getLoopExitInstr(), ExtA, ExtB, BinOp);
8879 
8880   unsigned TargetScaleFactor =
8881       PHI->getType()->getPrimitiveSizeInBits().getKnownScalarFactor(
8882           A->getType()->getPrimitiveSizeInBits());
8883 
8884   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8885           [&](ElementCount VF) {
8886             InstructionCost Cost = TTI->getPartialReductionCost(
8887                 Update->getOpcode(), A->getType(), B->getType(), PHI->getType(),
8888                 VF, OpAExtend, OpBExtend,
8889                 std::make_optional(BinOp->getOpcode()));
8890             return Cost.isValid();
8891           },
8892           Range))
8893     return std::make_pair(Chain, TargetScaleFactor);
8894 
8895   return std::nullopt;
8896 }
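
// Illustrative sketch (assumed example): a dot-product style loop matching the
// extend/extend/binop/accumulate chain above. With i8 inputs and an i32
// accumulator, the scale factor is 32 / 8 = 4:
//
//   int dot(const signed char *a, const signed char *b, int n) {
//     int sum = 0;
//     for (int i = 0; i < n; ++i)
//       sum += (int)a[i] * (int)b[i];   // sext, sext, mul, add into the phi
//     return sum;
//   }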
8897 
8898 VPRecipeBase *
8899 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8900                                         ArrayRef<VPValue *> Operands,
8901                                         VFRange &Range, VPBasicBlock *VPBB) {
8902   // First, check for specific widening recipes that deal with inductions, Phi
8903   // nodes, calls and memory operations.
8904   VPRecipeBase *Recipe;
8905   if (auto *Phi = dyn_cast<PHINode>(Instr)) {
8906     if (Phi->getParent() != OrigLoop->getHeader())
8907       return tryToBlend(Phi, Operands);
8908 
8909     if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range)))
8910       return Recipe;
8911 
8912     VPHeaderPHIRecipe *PhiRecipe = nullptr;
8913     assert((Legal->isReductionVariable(Phi) ||
8914             Legal->isFixedOrderRecurrence(Phi)) &&
8915            "can only widen reductions and fixed-order recurrences here");
8916     VPValue *StartV = Operands[0];
8917     if (Legal->isReductionVariable(Phi)) {
8918       const RecurrenceDescriptor &RdxDesc =
8919           Legal->getReductionVars().find(Phi)->second;
8920       assert(RdxDesc.getRecurrenceStartValue() ==
8921              Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8922 
8923       // If the PHI is used by a partial reduction, set the scale factor.
8924       std::optional<std::pair<PartialReductionChain, unsigned>> Pair =
8925           getScaledReductionForInstr(RdxDesc.getLoopExitInstr());
8926       unsigned ScaleFactor = Pair ? Pair->second : 1;
8927       PhiRecipe = new VPReductionPHIRecipe(
8928           Phi, RdxDesc, *StartV, CM.isInLoopReduction(Phi),
8929           CM.useOrderedReductions(RdxDesc), ScaleFactor);
8930     } else {
8931       // TODO: Currently fixed-order recurrences are modeled as chains of
8932       // first-order recurrences. If there are no users of the intermediate
8933       // recurrences in the chain, the fixed order recurrence should be modeled
8934       // directly, enabling more efficient codegen.
8935       PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
8936     }
8937 
8938     PhisToFix.push_back(PhiRecipe);
8939     return PhiRecipe;
8940   }
8941 
8942   if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate(
8943                                     cast<TruncInst>(Instr), Operands, Range)))
8944     return Recipe;
8945 
8946   // All widen recipes below deal only with VF > 1.
8947   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8948           [&](ElementCount VF) { return VF.isScalar(); }, Range))
8949     return nullptr;
8950 
8951   if (auto *CI = dyn_cast<CallInst>(Instr))
8952     return tryToWidenCall(CI, Operands, Range);
8953 
8954   if (StoreInst *SI = dyn_cast<StoreInst>(Instr))
8955     if (auto HistInfo = Legal->getHistogramInfo(SI))
8956       return tryToWidenHistogram(*HistInfo, Operands);
8957 
8958   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8959     return tryToWidenMemory(Instr, Operands, Range);
8960 
8961   if (getScaledReductionForInstr(Instr))
8962     return tryToCreatePartialReduction(Instr, Operands);
8963 
8964   if (!shouldWiden(Instr, Range))
8965     return nullptr;
8966 
8967   if (auto *GEP = dyn_cast<GetElementPtrInst>(Instr))
8968     return new VPWidenGEPRecipe(GEP,
8969                                 make_range(Operands.begin(), Operands.end()));
8970 
8971   if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8972     return new VPWidenSelectRecipe(
8973         *SI, make_range(Operands.begin(), Operands.end()));
8974   }
8975 
8976   if (auto *CI = dyn_cast<CastInst>(Instr)) {
8977     return new VPWidenCastRecipe(CI->getOpcode(), Operands[0], CI->getType(),
8978                                  *CI);
8979   }
8980 
8981   return tryToWiden(Instr, Operands, VPBB);
8982 }
8983 
8984 VPRecipeBase *
8985 VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
8986                                              ArrayRef<VPValue *> Operands) {
8987   assert(Operands.size() == 2 &&
8988          "Unexpected number of operands for partial reduction");
8989 
8990   VPValue *BinOp = Operands[0];
8991   VPValue *Phi = Operands[1];
8992   if (isa<VPReductionPHIRecipe>(BinOp->getDefiningRecipe()))
8993     std::swap(BinOp, Phi);
8994 
8995   return new VPPartialReductionRecipe(Reduction->getOpcode(), BinOp, Phi,
8996                                       Reduction);
8997 }
8998 
8999 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
9000                                                         ElementCount MaxVF) {
9001   assert(OrigLoop->isInnermost() && "Inner loop expected.");
9002 
9003   auto MaxVFTimes2 = MaxVF * 2;
9004   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
9005     VFRange SubRange = {VF, MaxVFTimes2};
9006     if (auto Plan = tryToBuildVPlanWithVPRecipes(SubRange)) {
9007       // Now optimize the initial VPlan.
9008       if (!Plan->hasVF(ElementCount::getFixed(1)))
9009         VPlanTransforms::truncateToMinimalBitwidths(*Plan,
9010                                                     CM.getMinimalBitwidths());
9011       VPlanTransforms::optimize(*Plan);
9012       // TODO: try to put it close to addActiveLaneMask().
9013       // Discard the plan if it is not EVL-compatible
9014       if (CM.foldTailWithEVL() && !VPlanTransforms::tryAddExplicitVectorLength(
9015                                       *Plan, CM.getMaxSafeElements()))
9016         break;
9017       assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
9018       VPlans.push_back(std::move(Plan));
9019     }
9020     VF = SubRange.End;
9021   }
9022 }
9023 
9024 // Add the necessary canonical IV and branch recipes required to control the
9025 // loop.
9026 static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
9027                                   DebugLoc DL) {
9028   Value *StartIdx = ConstantInt::get(IdxTy, 0);
9029   auto *StartV = Plan.getOrAddLiveIn(StartIdx);
9030 
9031   // Add a VPCanonicalIVPHIRecipe starting at 0 to the header.
9032   auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
9033   VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
9034   VPBasicBlock *Header = TopRegion->getEntryBasicBlock();
9035   Header->insert(CanonicalIVPHI, Header->begin());
9036 
9037   VPBuilder Builder(TopRegion->getExitingBasicBlock());
9038   // Add a VPInstruction to increment the scalar canonical IV by VF * UF.
9039   auto *CanonicalIVIncrement = Builder.createOverflowingOp(
9040       Instruction::Add, {CanonicalIVPHI, &Plan.getVFxUF()}, {HasNUW, false}, DL,
9041       "index.next");
9042   CanonicalIVPHI->addOperand(CanonicalIVIncrement);
9043 
9044   // Add the BranchOnCount VPInstruction to the latch.
9045   Builder.createNaryOp(VPInstruction::BranchOnCount,
9046                        {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
9047 }
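
// Conceptually (illustrative only), the loop control added above corresponds
// to:
//
//   index = 0                                         // VPCanonicalIVPHIRecipe
//   loop:
//     ...vector body...
//     index.next = index + VF * UF                    // 'nuw' when HasNUW
//     branch-on-count(index.next, vector.trip.count)  // exit when equal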
9048 
9049 /// Create and return a ResumePhi for \p WideIV, unless it is truncated. If the
9050 /// induction recipe is not canonical, creates a VPDerivedIVRecipe to compute
9051 /// the end value of the induction.
9052 static VPValue *addResumePhiRecipeForInduction(VPWidenInductionRecipe *WideIV,
9053                                                VPBuilder &VectorPHBuilder,
9054                                                VPBuilder &ScalarPHBuilder,
9055                                                VPTypeAnalysis &TypeInfo,
9056                                                VPValue *VectorTC) {
9057   auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
9058   // Truncated wide inductions resume from the last lane of their vector value
9059   // in the last vector iteration, which is handled elsewhere.
9060   if (WideIntOrFp && WideIntOrFp->getTruncInst())
9061     return nullptr;
9062 
9063   VPValue *Start = WideIV->getStartValue();
9064   VPValue *Step = WideIV->getStepValue();
9065   const InductionDescriptor &ID = WideIV->getInductionDescriptor();
9066   VPValue *EndValue = VectorTC;
9067   if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
9068     EndValue = VectorPHBuilder.createDerivedIV(
9069         ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
9070         Start, VectorTC, Step);
9071   }
9072 
9073   // EndValue is derived from the vector trip count (which has the same type as
9074   // the widest induction) and thus may be wider than the induction here.
9075   Type *ScalarTypeOfWideIV = TypeInfo.inferScalarType(WideIV);
9076   if (ScalarTypeOfWideIV != TypeInfo.inferScalarType(EndValue)) {
9077     EndValue = VectorPHBuilder.createScalarCast(Instruction::Trunc, EndValue,
9078                                                 ScalarTypeOfWideIV,
9079                                                 WideIV->getDebugLoc());
9080   }
9081 
9082   auto *ResumePhiRecipe =
9083       ScalarPHBuilder.createNaryOp(VPInstruction::ResumePhi, {EndValue, Start},
9084                                    WideIV->getDebugLoc(), "bc.resume.val");
9085   return ResumePhiRecipe;
9086 }
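
// Illustrative sketch (assumed example): for a non-canonical induction
// 'j = 5 + 3 * i' (start 5, step 3), the end value fed to the resume phi is
//
//   end = derived-iv(5, vector.trip.count, 3)  // i.e. 5 + vector.trip.count * 3
//
// and the scalar loop resumes from 'end' when reached via the middle block, or
// from the original start (5) when the vector loop is bypassed.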
9087 
9088 /// Create resume phis in the scalar preheader for first-order recurrences,
9089 /// reductions and inductions, and update the VPIRInstructions wrapping the
9090 /// original phis in the scalar header.
9091 static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan) {
9092   VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
9093   auto *ScalarPH = Plan.getScalarPreheader();
9094   auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getSinglePredecessor());
9095   VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
9096   VPBuilder VectorPHBuilder(
9097       cast<VPBasicBlock>(VectorRegion->getSinglePredecessor()));
9098   VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
9099   VPBuilder ScalarPHBuilder(ScalarPH);
9100   VPValue *OneVPV = Plan.getOrAddLiveIn(
9101       ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1));
9102   for (VPRecipeBase &ScalarPhiR : *Plan.getScalarHeader()) {
9103     auto *ScalarPhiIRI = cast<VPIRInstruction>(&ScalarPhiR);
9104     auto *ScalarPhiI = dyn_cast<PHINode>(&ScalarPhiIRI->getInstruction());
9105     if (!ScalarPhiI)
9106       break;
9107 
9108     auto *VectorPhiR = cast<VPHeaderPHIRecipe>(Builder.getRecipe(ScalarPhiI));
9109     if (auto *WideIVR = dyn_cast<VPWidenInductionRecipe>(VectorPhiR)) {
9110       if (VPValue *ResumePhi = addResumePhiRecipeForInduction(
9111               WideIVR, VectorPHBuilder, ScalarPHBuilder, TypeInfo,
9112               &Plan.getVectorTripCount())) {
9113         ScalarPhiIRI->addOperand(ResumePhi);
9114         continue;
9115       }
9116       // TODO: Also handle truncated inductions here. Computing end-values
9117       // separately should be done as VPlan-to-VPlan optimization, after
9118       // legalizing all resume values to use the last lane from the loop.
9119       assert(cast<VPWidenIntOrFpInductionRecipe>(VectorPhiR)->getTruncInst() &&
9120              "should only skip truncated wide inductions");
9121       continue;
9122     }
9123 
9124     // The backedge value provides the value to resume coming out of a loop,
9125     // which for FORs is a vector whose last element needs to be extracted. The
9126     // start value provides the value if the loop is bypassed.
9127     bool IsFOR = isa<VPFirstOrderRecurrencePHIRecipe>(VectorPhiR);
9128     auto *ResumeFromVectorLoop = VectorPhiR->getBackedgeValue();
9129     assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
9130            "Cannot handle loops with uncountable early exits");
9131     if (IsFOR)
9132       ResumeFromVectorLoop = MiddleBuilder.createNaryOp(
9133           VPInstruction::ExtractFromEnd, {ResumeFromVectorLoop, OneVPV}, {},
9134           "vector.recur.extract");
9135     StringRef Name = IsFOR ? "scalar.recur.init" : "bc.merge.rdx";
9136     auto *ResumePhiR = ScalarPHBuilder.createNaryOp(
9137         VPInstruction::ResumePhi,
9138         {ResumeFromVectorLoop, VectorPhiR->getStartValue()}, {}, Name);
9139     ScalarPhiIRI->addOperand(ResumePhiR);
9140   }
9141 }
9142 
9143 /// Return true if \p VPV is an optimizable IV or IV use. That is, if \p VPV is
9144 /// either an untruncated wide induction, or if it increments a wide induction
9145 /// by its step.
9146 static bool isOptimizableIVOrUse(VPValue *VPV) {
9147   VPRecipeBase *Def = VPV->getDefiningRecipe();
9148   if (!Def)
9149     return false;
9150   auto *WideIV = dyn_cast<VPWidenInductionRecipe>(Def);
9151   if (WideIV) {
9152     // VPV itself is a wide induction, separately compute the end value for exit
9153     // users if it is not a truncated IV.
9154     return isa<VPWidenPointerInductionRecipe>(WideIV) ||
9155            !cast<VPWidenIntOrFpInductionRecipe>(WideIV)->getTruncInst();
9156   }
9157 
9158   // Check if VPV is an optimizable induction increment.
9159   if (Def->getNumOperands() != 2)
9160     return false;
9161   WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(0));
9162   if (!WideIV)
9163     WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(1));
9164   if (!WideIV)
9165     return false;
9166 
9167   using namespace VPlanPatternMatch;
9168   auto &ID = WideIV->getInductionDescriptor();
9169 
9170   // Check if VPV increments the induction by the induction step.
9171   VPValue *IVStep = WideIV->getStepValue();
9172   switch (ID.getInductionOpcode()) {
9173   case Instruction::Add:
9174     return match(VPV, m_c_Binary<Instruction::Add>(m_Specific(WideIV),
9175                                                    m_Specific(IVStep)));
9176   case Instruction::FAdd:
9177     return match(VPV, m_c_Binary<Instruction::FAdd>(m_Specific(WideIV),
9178                                                     m_Specific(IVStep)));
9179   case Instruction::FSub:
9180     return match(VPV, m_Binary<Instruction::FSub>(m_Specific(WideIV),
9181                                                   m_Specific(IVStep)));
9182   case Instruction::Sub: {
9183     // IVStep will be the negated step of the subtraction. Check if Step == -1 *
9184     // IVStep.
9185     VPValue *Step;
9186     if (!match(VPV, m_Binary<Instruction::Sub>(m_VPValue(), m_VPValue(Step))) ||
9187         !Step->isLiveIn() || !IVStep->isLiveIn())
9188       return false;
9189     auto *StepCI = dyn_cast<ConstantInt>(Step->getLiveInIRValue());
9190     auto *IVStepCI = dyn_cast<ConstantInt>(IVStep->getLiveInIRValue());
9191     return StepCI && IVStepCI &&
9192            StepCI->getValue() == (-1 * IVStepCI->getValue());
9193   }
9194   default:
9195     return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
9196            match(VPV, m_GetElementPtr(m_Specific(WideIV),
9197                                       m_Specific(WideIV->getStepValue())));
9198   }
9199   llvm_unreachable("should have been covered by switch above");
9200 }
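
// Illustrative sketch (assumed example): for a widened induction
//
//   i      = WIDEN-INDUCTION [start, step]
//   i.next = i + step
//
// both 'i' and 'i.next' are considered optimizable here, so their exit values
// are computed outside of VPlan rather than via extracts in the middle block.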
9201 
9202 // Collect VPIRInstructions for phis in the exit blocks that are modeled
9203 // in VPlan and add the exiting VPValue as operand. Some exiting values are not
9204 // modeled explicitly yet and won't be included. Those are un-truncated
9205 // VPWidenIntOrFpInductionRecipe, VPWidenPointerInductionRecipe and induction
9206 // increments.
9207 static SetVector<VPIRInstruction *>
9208 collectUsersInExitBlocks(Loop *OrigLoop, VPRecipeBuilder &Builder,
9209                          VPlan &Plan) {
9210   auto *MiddleVPBB = Plan.getMiddleBlock();
9211   SetVector<VPIRInstruction *> ExitUsersToFix;
9212   for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
9213     for (VPRecipeBase &R : *ExitVPBB) {
9214       auto *ExitIRI = dyn_cast<VPIRInstruction>(&R);
9215       if (!ExitIRI)
9216         continue;
9217       auto *ExitPhi = dyn_cast<PHINode>(&ExitIRI->getInstruction());
9218       if (!ExitPhi)
9219         break;
9220       for (VPBlockBase *PredVPBB : ExitVPBB->getPredecessors()) {
9221         BasicBlock *ExitingBB = OrigLoop->getLoopLatch();
9222         if (PredVPBB != MiddleVPBB) {
9223           SmallVector<BasicBlock *> ExitingBlocks;
9224           OrigLoop->getExitingBlocks(ExitingBlocks);
9225           assert(ExitingBlocks.size() == 2 && "only support 2 exiting blocks");
9226           ExitingBB = ExitingBB == ExitingBlocks[0] ? ExitingBlocks[1]
9227                                                     : ExitingBlocks[0];
9228         }
9229         Value *IncomingValue = ExitPhi->getIncomingValueForBlock(ExitingBB);
9230         VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue);
9231         // Exit values for inductions are computed and updated outside of VPlan
9232         // and independent of induction recipes.
9233         // TODO: Compute induction exit values in VPlan.
9234         if (isOptimizableIVOrUse(V) &&
9235             ExitVPBB->getSinglePredecessor() == MiddleVPBB)
9236           continue;
9237         ExitUsersToFix.insert(ExitIRI);
9238         ExitIRI->addOperand(V);
9239       }
9240     }
9241   }
9242   return ExitUsersToFix;
9243 }
9244 
9245 // Add exit values to \p Plan. Extracts are added for each entry in \p
9246 // ExitUsersToFix if needed and their operands are updated. Returns true if all
9247 // exit users can be handled, otherwise returns false.
9248 static bool
9249 addUsersInExitBlocks(VPlan &Plan,
9250                      const SetVector<VPIRInstruction *> &ExitUsersToFix) {
9251   if (ExitUsersToFix.empty())
9252     return true;
9253 
9254   auto *MiddleVPBB = Plan.getMiddleBlock();
9255   VPBuilder B(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
9256 
9257   // Introduce extract for exiting values and update the VPIRInstructions
9258   // modeling the corresponding LCSSA phis.
9259   for (VPIRInstruction *ExitIRI : ExitUsersToFix) {
9260     for (const auto &[Idx, Op] : enumerate(ExitIRI->operands())) {
9261       // Pass live-in values used by exit phis directly through to their users
9262       // in the exit block.
9263       if (Op->isLiveIn())
9264         continue;
9265 
9266       // Currently only live-ins can be used by exit values from blocks not
9267       // exiting via the vector latch through to the middle block.
9268       if (ExitIRI->getParent()->getSinglePredecessor() != MiddleVPBB)
9269         return false;
9270 
9271       LLVMContext &Ctx = ExitIRI->getInstruction().getContext();
9272       VPValue *Ext = B.createNaryOp(VPInstruction::ExtractFromEnd,
9273                                     {Op, Plan.getOrAddLiveIn(ConstantInt::get(
9274                                              IntegerType::get(Ctx, 32), 1))});
9275       ExitIRI->setOperand(Idx, Ext);
9276     }
9277   }
9278   return true;
9279 }
9280 
9281 /// Handle users in the original exit block for first-order recurrences. The
9282 /// penultimate value of each recurrence is fed to its LCSSA phi users in the
9283 /// original exit block via the VPIRInstruction wrapping the corresponding
9284 /// LCSSA phi.
9285 static void addExitUsersForFirstOrderRecurrences(
9286     VPlan &Plan, SetVector<VPIRInstruction *> &ExitUsersToFix) {
9287   VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
9288   auto *ScalarPHVPBB = Plan.getScalarPreheader();
9289   auto *MiddleVPBB = Plan.getMiddleBlock();
9290   VPBuilder ScalarPHBuilder(ScalarPHVPBB);
9291   VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
9292   VPValue *TwoVPV = Plan.getOrAddLiveIn(
9293       ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 2));
9294 
9295   for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
9296     auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
9297     if (!FOR)
9298       continue;
9299 
9300     assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
9301            "Cannot handle loops with uncountable early exits");
9302 
9303     // This is the second phase of vectorizing first-order recurrences, creating
9304     // extracts for users outside the loop. An overview of the transformation is
9305     // described below. Suppose we have the following loop with some use after
9306     // the loop of the last a[i-1],
9307     //
9308     //   for (int i = 0; i < n; ++i) {
9309     //     t = a[i - 1];
9310     //     b[i] = a[i] - t;
9311     //   }
9312     //   use t;
9313     //
9314     // There is a first-order recurrence on "a". For this loop, the shorthand
9315     // scalar IR looks like:
9316     //
9317     //   scalar.ph:
9318     //     s.init = a[-1]
9319     //     br scalar.body
9320     //
9321     //   scalar.body:
9322     //     i = phi [0, scalar.ph], [i+1, scalar.body]
9323     //     s1 = phi [s.init, scalar.ph], [s2, scalar.body]
9324     //     s2 = a[i]
9325     //     b[i] = s2 - s1
9326     //     br cond, scalar.body, exit.block
9327     //
9328     //   exit.block:
9329     //     use = lcssa.phi [s1, scalar.body]
9330     //
9331     // In this example, s1 is a recurrence because its value depends on the
9332     // previous iteration. In the first phase of vectorization, we created a
9333     // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts
9334     // for users in the scalar preheader and exit block.
9335     //
9336     //   vector.ph:
9337     //     v_init = vector(..., ..., ..., a[-1])
9338     //     br vector.body
9339     //
9340     //   vector.body
9341     //     i = phi [0, vector.ph], [i+4, vector.body]
9342     //     v1 = phi [v_init, vector.ph], [v2, vector.body]
9343     //     v2 = a[i, i+1, i+2, i+3]
9344     //     b[i] = v2 - v1
9345     //     // Next, third phase will introduce v1' = splice(v1(3), v2(0, 1, 2))
9346     //     b[i, i+1, i+2, i+3] = v2 - v1
9347     //     br cond, vector.body, middle.block
9348     //
9349     //   middle.block:
9350     //     vector.recur.extract.for.phi = v2(2)
9351     //     vector.recur.extract = v2(3)
9352     //     br cond, scalar.ph, exit.block
9353     //
9354     //   scalar.ph:
9355     //     scalar.recur.init = phi [vector.recur.extract, middle.block],
9356     //                             [s.init, otherwise]
9357     //     br scalar.body
9358     //
9359     //   scalar.body:
9360     //     i = phi [0, scalar.ph], [i+1, scalar.body]
9361     //     s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body]
9362     //     s2 = a[i]
9363     //     b[i] = s2 - s1
9364     //     br cond, scalar.body, exit.block
9365     //
9366     //   exit.block:
9367     //     lo = lcssa.phi [s1, scalar.body],
9368     //                    [vector.recur.extract.for.phi, middle.block]
9369     //
9370     // Now update VPIRInstructions modeling LCSSA phis in the exit block.
9371     // Extract the penultimate value of the recurrence and use it as operand for
9372     // the VPIRInstruction modeling the phi.
9373     for (VPIRInstruction *ExitIRI : ExitUsersToFix) {
9374       if (ExitIRI->getOperand(0) != FOR)
9375         continue;
9376       VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
9377           VPInstruction::ExtractFromEnd, {FOR->getBackedgeValue(), TwoVPV}, {},
9378           "vector.recur.extract.for.phi");
9379       ExitIRI->setOperand(0, PenultimateElement);
9380       ExitUsersToFix.remove(ExitIRI);
9381     }
9382   }
9383 }
9384 
9385 VPlanPtr
9386 LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
9387 
9388   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
9389 
9390   // ---------------------------------------------------------------------------
9391   // Build initial VPlan: Scan the body of the loop in a topological order to
9392   // visit each basic block after having visited its predecessor basic blocks.
9393   // ---------------------------------------------------------------------------
9394 
9395   // Create initial VPlan skeleton, having a basic block for the pre-header
9396   // which contains SCEV expansions that need to happen before the CFG is
9397   // modified; a basic block for the vector pre-header, followed by a region for
9398   // the vector loop, followed by the middle basic block. The skeleton vector
9399   // loop region contains a header and latch basic blocks.
9400 
9401   bool RequiresScalarEpilogueCheck =
9402       LoopVectorizationPlanner::getDecisionAndClampRange(
9403           [this](ElementCount VF) {
9404             return !CM.requiresScalarEpilogue(VF.isVector());
9405           },
9406           Range);
9407   VPlanPtr Plan = VPlan::createInitialVPlan(Legal->getWidestInductionType(),
9408                                             PSE, RequiresScalarEpilogueCheck,
9409                                             CM.foldTailByMasking(), OrigLoop);
9410 
9411   // Don't use getDecisionAndClampRange here, because we don't know the UF,
9412   // so it is better for this function to be conservative rather than to
9413   // split it up into different VPlans.
9414   // TODO: Consider using getDecisionAndClampRange here to split up VPlans.
9415   bool IVUpdateMayOverflow = false;
9416   for (ElementCount VF : Range)
9417     IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF);
9418 
9419   DebugLoc DL = getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
9420   TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow);
9421   // Use NUW for the induction increment if we proved that it won't overflow in
9422   // the vector loop or when not folding the tail. In the latter case, we know
9423   // that the canonical induction increment will not overflow as the vector trip
9424   // count is >= increment and a multiple of the increment.
9425   bool HasNUW = !IVUpdateMayOverflow || Style == TailFoldingStyle::None;
9426   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL);
9427 
9428   VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE,
9429                                 Builder);
9430 
9431   // ---------------------------------------------------------------------------
9432   // Pre-construction: record ingredients whose recipes we'll need to further
9433   // process after constructing the initial VPlan.
9434   // ---------------------------------------------------------------------------
9435 
9436   // For each interleave group which is relevant for this (possibly trimmed)
9437   // Range, add it to the set of groups to be later applied to the VPlan and add
9438   // placeholders for its members' Recipes which we'll be replacing with a
9439   // single VPInterleaveRecipe.
9440   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
9441     auto ApplyIG = [IG, this](ElementCount VF) -> bool {
9442       bool Result = (VF.isVector() && // Query is illegal for VF == 1
9443                      CM.getWideningDecision(IG->getInsertPos(), VF) ==
9444                          LoopVectorizationCostModel::CM_Interleave);
9445       // For scalable vectors, the only interleave factor currently supported
9446       // must be power of 2 since we require the (de)interleave2 intrinsics
9447       // instead of shufflevectors.
9448       assert((!Result || !VF.isScalable() || isPowerOf2_32(IG->getFactor())) &&
9449              "Unsupported interleave factor for scalable vectors");
9450       return Result;
9451     };
9452     if (!getDecisionAndClampRange(ApplyIG, Range))
9453       continue;
9454     InterleaveGroups.insert(IG);
9455   }
9456 
9457   // ---------------------------------------------------------------------------
9458   // Construct recipes for the instructions in the loop
9459   // ---------------------------------------------------------------------------
9460 
9461   // Scan the body of the loop in a topological order to visit each basic block
9462   // after having visited its predecessor basic blocks.
9463   LoopBlocksDFS DFS(OrigLoop);
9464   DFS.perform(LI);
9465 
9466   VPBasicBlock *HeaderVPBB = Plan->getVectorLoopRegion()->getEntryBasicBlock();
9467   VPBasicBlock *VPBB = HeaderVPBB;
9468   BasicBlock *HeaderBB = OrigLoop->getHeader();
9469   bool NeedsMasks =
9470       CM.foldTailByMasking() ||
9471       any_of(OrigLoop->blocks(), [this, HeaderBB](BasicBlock *BB) {
9472         bool NeedsBlends = BB != HeaderBB && !BB->phis().empty();
9473         return Legal->blockNeedsPredication(BB) || NeedsBlends;
9474       });
9475 
9476   RecipeBuilder.collectScaledReductions(Range);
9477 
9478   auto *MiddleVPBB = Plan->getMiddleBlock();
9479   VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi();
9480   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
9481     // Relevant instructions from basic block BB will be grouped into VPRecipe
9482     // ingredients and fill a new VPBasicBlock.
9483     if (VPBB != HeaderVPBB)
9484       VPBB->setName(BB->getName());
9485     Builder.setInsertPoint(VPBB);
9486 
9487     if (VPBB == HeaderVPBB)
9488       RecipeBuilder.createHeaderMask();
9489     else if (NeedsMasks)
9490       RecipeBuilder.createBlockInMask(BB);
9491 
9492     // Introduce each ingredient into VPlan.
9493     // TODO: Model and preserve debug intrinsics in VPlan.
9494     for (Instruction &I : drop_end(BB->instructionsWithoutDebug(false))) {
9495       Instruction *Instr = &I;
9496       SmallVector<VPValue *, 4> Operands;
9497       auto *Phi = dyn_cast<PHINode>(Instr);
9498       if (Phi && Phi->getParent() == HeaderBB) {
9499         Operands.push_back(Plan->getOrAddLiveIn(
9500             Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
9501       } else {
9502         auto OpRange = RecipeBuilder.mapToVPValues(Instr->operands());
9503         Operands = {OpRange.begin(), OpRange.end()};
9504       }
9505 
9506       // Stores to an invariant address inside the loop will be deleted, and in
9507       // the exit block a uniform store recipe will be created for the final
9508       // invariant store of the reduction.
9509       StoreInst *SI;
9510       if ((SI = dyn_cast<StoreInst>(&I)) &&
9511           Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) {
9512         // Only create recipe for the final invariant store of the reduction.
9513         if (!Legal->isInvariantStoreOfReduction(SI))
9514           continue;
9515         auto *Recipe = new VPReplicateRecipe(
9516             SI, RecipeBuilder.mapToVPValues(Instr->operands()),
9517             true /* IsUniform */);
9518         Recipe->insertBefore(*MiddleVPBB, MBIP);
9519         continue;
9520       }
9521 
9522       VPRecipeBase *Recipe =
9523           RecipeBuilder.tryToCreateWidenRecipe(Instr, Operands, Range, VPBB);
9524       if (!Recipe)
9525         Recipe = RecipeBuilder.handleReplication(Instr, Range);
9526 
9527       RecipeBuilder.setRecipe(Instr, Recipe);
9528       if (isa<VPHeaderPHIRecipe>(Recipe)) {
9529         // VPHeaderPHIRecipes must be kept in the phi section of HeaderVPBB. In
9530         // the following cases, VPHeaderPHIRecipes may be created after non-phi
9531         // recipes and need to be moved to the phi section of HeaderVPBB:
9532         // * tail-folding (non-phi recipes computing the header mask are
9533         // introduced earlier than regular header phi recipes, and should appear
9534         // after them)
9535         // * Optimizing truncates to VPWidenIntOrFpInductionRecipe.
9536 
9537         assert((HeaderVPBB->getFirstNonPhi() == VPBB->end() ||
9538                 CM.foldTailByMasking() || isa<TruncInst>(Instr)) &&
9539                "unexpected recipe needs moving");
9540         Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
9541       } else
9542         VPBB->appendRecipe(Recipe);
9543     }
9544 
9545     VPBlockUtils::insertBlockAfter(Plan->createVPBasicBlock(""), VPBB);
9546     VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
9547   }
9548 
9549   // After here, VPBB should not be used.
9550   VPBB = nullptr;
9551 
9552   assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
9553          !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
9554          "entry block must be set to a VPRegionBlock having a non-empty entry "
9555          "VPBasicBlock");
9556   RecipeBuilder.fixHeaderPhis();
9557 
9558   // Update wide induction increments to use the same step as the corresponding
9559   // wide induction. This enables detecting induction increments directly in
9560   // VPlan and removes redundant splats.
9561   for (const auto &[Phi, ID] : Legal->getInductionVars()) {
9562     auto *IVInc = cast<Instruction>(
9563         Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
9564     if (IVInc->getOperand(0) != Phi || IVInc->getOpcode() != Instruction::Add)
9565       continue;
9566     VPWidenInductionRecipe *WideIV =
9567         cast<VPWidenInductionRecipe>(RecipeBuilder.getRecipe(Phi));
9568     VPRecipeBase *R = RecipeBuilder.getRecipe(IVInc);
9569     R->setOperand(1, WideIV->getStepValue());
9570   }
9571 
9572   if (auto *UncountableExitingBlock =
9573           Legal->getUncountableEarlyExitingBlock()) {
9574     VPlanTransforms::handleUncountableEarlyExit(
9575         *Plan, *PSE.getSE(), OrigLoop, UncountableExitingBlock, RecipeBuilder);
9576   }
9577   addScalarResumePhis(RecipeBuilder, *Plan);
9578   SetVector<VPIRInstruction *> ExitUsersToFix =
9579       collectUsersInExitBlocks(OrigLoop, RecipeBuilder, *Plan);
9580   addExitUsersForFirstOrderRecurrences(*Plan, ExitUsersToFix);
9581   if (!addUsersInExitBlocks(*Plan, ExitUsersToFix)) {
9582     reportVectorizationFailure(
9583         "Some exit values in loop with uncountable exit not supported yet",
9584         "UncountableEarlyExitLoopsUnsupportedExitValue", ORE, OrigLoop);
9585     return nullptr;
9586   }
9587 
9588   // ---------------------------------------------------------------------------
9589   // Transform initial VPlan: Apply previously taken decisions, in order, to
9590   // bring the VPlan to its final state.
9591   // ---------------------------------------------------------------------------
9592 
9593   // Adjust the recipes for any inloop reductions.
9594   adjustRecipesForReductions(Plan, RecipeBuilder, Range.Start);
9595 
9596   // Interleave memory: for each Interleave Group we marked earlier as relevant
9597   // for this VPlan, replace the Recipes widening its memory instructions with a
9598   // single VPInterleaveRecipe at its insertion point.
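  // E.g. (a conceptual sketch), two loads of A[2*i] and A[2*i+1] forming a
  // group with factor 2 become, for VF=4, a single wide load of 8 elements
  // followed by de-interleaving shuffles extracting the even and odd lanes.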
9599   VPlanTransforms::createInterleaveGroups(
9600       *Plan, InterleaveGroups, RecipeBuilder, CM.isScalarEpilogueAllowed());
9601 
9602   for (ElementCount VF : Range)
9603     Plan->addVF(VF);
9604   Plan->setName("Initial VPlan");
9605 
9606   // Replace VPValues for known constant strides guaranteed by predicated
9607   // scalar evolution.
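  // E.g. (an illustrative sketch), if the loop was versioned on %stride == 1,
  // the live-in VPValue for %stride is replaced by the constant 1 for users
  // inside the vector loop region; a use reached only through
  // 'zext i32 %stride to i64' gets a matching i64 1 live-in instead.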
9608   auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) {
9609     auto *R = cast<VPRecipeBase>(&U);
9610     return R->getParent()->getParent() ||
9611            R->getParent() ==
9612                Plan->getVectorLoopRegion()->getSinglePredecessor();
9613   };
9614   for (auto [_, Stride] : Legal->getLAI()->getSymbolicStrides()) {
9615     auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
9616     auto *ScevStride = dyn_cast<SCEVConstant>(PSE.getSCEV(StrideV));
9617     // Only handle constant strides for now.
9618     if (!ScevStride)
9619       continue;
9620 
9621     auto *CI = Plan->getOrAddLiveIn(
9622         ConstantInt::get(Stride->getType(), ScevStride->getAPInt()));
9623     if (VPValue *StrideVPV = Plan->getLiveIn(StrideV))
9624       StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
9625 
9626     // The versioned value may not be used in the loop directly but through a
9627     // sext/zext. Add new live-ins in those cases.
9628     for (Value *U : StrideV->users()) {
9629       if (!isa<SExtInst, ZExtInst>(U))
9630         continue;
9631       VPValue *StrideVPV = Plan->getLiveIn(U);
9632       if (!StrideVPV)
9633         continue;
9634       unsigned BW = U->getType()->getScalarSizeInBits();
9635       APInt C = isa<SExtInst>(U) ? ScevStride->getAPInt().sext(BW)
9636                                  : ScevStride->getAPInt().zext(BW);
9637       VPValue *CI = Plan->getOrAddLiveIn(ConstantInt::get(U->getType(), C));
9638       StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
9639     }
9640   }
9641 
9642   VPlanTransforms::dropPoisonGeneratingRecipes(*Plan, [this](BasicBlock *BB) {
9643     return Legal->blockNeedsPredication(BB);
9644   });
9645 
9646   // Sink users of fixed-order recurrence past the recipe defining the previous
9647   // value and introduce FirstOrderRecurrenceSplice VPInstructions.
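  // E.g. (a sketch), for a recurrence 'b[i] = a[i] + a[i-1]', the value from
  // the previous iteration is formed by splicing the last lane of the previous
  // vector of a[] with the first VF-1 lanes of the current one.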
9648   if (!VPlanTransforms::adjustFixedOrderRecurrences(*Plan, Builder))
9649     return nullptr;
9650 
9651   if (useActiveLaneMask(Style)) {
9652     // TODO: Move checks to VPlanTransforms::addActiveLaneMask once
9653     // TailFoldingStyle is visible there.
9654     bool ForControlFlow = useActiveLaneMaskForControlFlow(Style);
9655     bool WithoutRuntimeCheck =
9656         Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
9657     VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow,
9658                                        WithoutRuntimeCheck);
9659   }
9660 
9661   assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
9662   return Plan;
9663 }
9664 
9665 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
9666   // Outer loop handling: outer loops may require CFG and instruction level
9667   // transformations before even evaluating whether vectorization is profitable.
9668   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
9669   // the vectorization pipeline.
9670   assert(!OrigLoop->isInnermost());
9671   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
9672 
9673   // Create new empty VPlan
9674   auto Plan = VPlan::createInitialVPlan(Legal->getWidestInductionType(), PSE,
9675                                         true, false, OrigLoop);
9676 
9677   // Build hierarchical CFG
9678   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
9679   HCFGBuilder.buildHierarchicalCFG();
9680 
9681   for (ElementCount VF : Range)
9682     Plan->addVF(VF);
9683 
9684   VPlanTransforms::VPInstructionsToVPRecipes(
9685       Plan,
9686       [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
9687       *PSE.getSE(), *TLI);
9688 
9689   // Remove the existing terminator of the exiting block of the top-most region.
9690   // A BranchOnCount will be added instead when adding the canonical IV recipes.
9691   auto *Term =
9692       Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator();
9693   Term->eraseFromParent();
9694 
9695   // Tail folding is not supported for outer loops, so the induction increment
9696   // is guaranteed to not wrap.
9697   bool HasNUW = true;
9698   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW,
9699                         DebugLoc());
9700 
9701   // Collect mapping of IR header phis to header phi recipes, to be used in
9702   // addScalarResumePhis.
9703   VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE,
9704                                 Builder);
9705   for (auto &R : Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
9706     if (isa<VPCanonicalIVPHIRecipe>(&R))
9707       continue;
9708     auto *HeaderR = cast<VPHeaderPHIRecipe>(&R);
9709     RecipeBuilder.setRecipe(HeaderR->getUnderlyingInstr(), HeaderR);
9710   }
9711   addScalarResumePhis(RecipeBuilder, *Plan);
9712 
9713   assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
9714   return Plan;
9715 }
9716 
9717 // Adjust the recipes for reductions. For in-loop reductions the chain of
9718 // instructions leading from the loop exit instr to the phi need to be converted
9719 // to reductions, with one operand being vector and the other being the scalar
9720 // reduction chain. For other reductions, a select is introduced between the phi
9721 // and users outside the vector region when folding the tail.
9722 //
9723 // A ComputeReductionResult recipe is added to the middle block, also for
9724 // in-loop reductions which compute their result in-loop, because generating
9725 // the subsequent bc.merge.rdx phi is driven by ComputeReductionResult recipes.
9726 //
9727 // Adjust AnyOf reductions; replace the reduction phi for the selected value
9728 // with a boolean reduction phi node to check if the condition is true in any
9729 // iteration. The final value is selected by the final ComputeReductionResult.
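//
// For example (an illustrative sketch), an in-loop integer add reduction
//   %red      = phi i32 [ 0, %ph ], [ %red.next, %loop ]
//   %red.next = add i32 %red, %a
// keeps a scalar accumulator: the widened add is replaced by a
// VPReductionRecipe taking the running scalar value and the vector operand
// for %a, and the ComputeReductionResult in the middle block yields the final
// scalar.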
9730 void LoopVectorizationPlanner::adjustRecipesForReductions(
9731     VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) {
9732   using namespace VPlanPatternMatch;
9733   VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion();
9734   VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock();
9735   VPBasicBlock *MiddleVPBB = Plan->getMiddleBlock();
9736   SmallVector<VPRecipeBase *> ToDelete;
9737 
9738   for (VPRecipeBase &R : Header->phis()) {
9739     auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9740     if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered()))
9741       continue;
9742 
9743     const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
9744     RecurKind Kind = RdxDesc.getRecurrenceKind();
9745     assert(
9746         !RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) &&
9747         !RecurrenceDescriptor::isFindLastIVRecurrenceKind(Kind) &&
9748         "AnyOf and FindLast reductions are not allowed for in-loop reductions");
9749 
9750     // Collect the chain of "link" recipes for the reduction starting at PhiR.
9751     SetVector<VPSingleDefRecipe *> Worklist;
9752     Worklist.insert(PhiR);
9753     for (unsigned I = 0; I != Worklist.size(); ++I) {
9754       VPSingleDefRecipe *Cur = Worklist[I];
9755       for (VPUser *U : Cur->users()) {
9756         auto *UserRecipe = cast<VPSingleDefRecipe>(U);
9757         if (!UserRecipe->getParent()->getEnclosingLoopRegion()) {
9758           assert((UserRecipe->getParent() == MiddleVPBB ||
9759                   UserRecipe->getParent() == Plan->getScalarPreheader()) &&
9760                  "U must be either in the loop region, the middle block or the "
9761                  "scalar preheader.");
9762           continue;
9763         }
9764         Worklist.insert(UserRecipe);
9765       }
9766     }
9767 
9768     // Visit operation "Links" along the reduction chain top-down starting from
9769     // the phi until LoopExitValue. We keep track of the previous item
9770     // (PreviousLink) to tell which of the two operands of a Link will remain
9771     // scalar and which will be reduced. For minmax by select(cmp), Link will be
9772     // the select instruction. Blend recipes of in-loop reduction phis will
9773     // get folded to their non-phi operand, as the reduction recipe handles the
9774     // condition directly.
9775     VPSingleDefRecipe *PreviousLink = PhiR; // Aka Worklist[0].
9776     for (VPSingleDefRecipe *CurrentLink : Worklist.getArrayRef().drop_front()) {
9777       Instruction *CurrentLinkI = CurrentLink->getUnderlyingInstr();
9778 
9779       // Index of the first operand which holds a non-mask vector operand.
9780       unsigned IndexOfFirstOperand;
9781       // Recognize a call to the llvm.fmuladd intrinsic.
9782       bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
9783       VPValue *VecOp;
9784       VPBasicBlock *LinkVPBB = CurrentLink->getParent();
9785       if (IsFMulAdd) {
9786         assert(
9787             RecurrenceDescriptor::isFMulAddIntrinsic(CurrentLinkI) &&
9788             "Expected instruction to be a call to the llvm.fmuladd intrinsic");
9789         assert(((MinVF.isScalar() && isa<VPReplicateRecipe>(CurrentLink)) ||
9790                 isa<VPWidenIntrinsicRecipe>(CurrentLink)) &&
9791                CurrentLink->getOperand(2) == PreviousLink &&
9792                "expected a call where the previous link is the added operand");
9793 
9794         // If the instruction is a call to the llvm.fmuladd intrinsic then we
9795         // need to create an fmul recipe (multiplying the first two operands of
9796         // the fmuladd together) to use as the vector operand for the fadd
9797         // reduction.
9798         VPInstruction *FMulRecipe = new VPInstruction(
9799             Instruction::FMul,
9800             {CurrentLink->getOperand(0), CurrentLink->getOperand(1)},
9801             CurrentLinkI->getFastMathFlags());
9802         LinkVPBB->insert(FMulRecipe, CurrentLink->getIterator());
9803         VecOp = FMulRecipe;
9804       } else {
9805         auto *Blend = dyn_cast<VPBlendRecipe>(CurrentLink);
9806         if (PhiR->isInLoop() && Blend) {
9807           assert(Blend->getNumIncomingValues() == 2 &&
9808                  "Blend must have 2 incoming values");
9809           if (Blend->getIncomingValue(0) == PhiR)
9810             Blend->replaceAllUsesWith(Blend->getIncomingValue(1));
9811           else {
9812             assert(Blend->getIncomingValue(1) == PhiR &&
9813                    "PhiR must be an operand of the blend");
9814             Blend->replaceAllUsesWith(Blend->getIncomingValue(0));
9815           }
9816           continue;
9817         }
9818 
9819         if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9820           if (isa<VPWidenRecipe>(CurrentLink)) {
9821             assert(isa<CmpInst>(CurrentLinkI) &&
9822                    "need to have the compare of the select");
9823             continue;
9824           }
9825           assert(isa<VPWidenSelectRecipe>(CurrentLink) &&
9826                  "must be a select recipe");
9827           IndexOfFirstOperand = 1;
9828         } else {
9829           assert((MinVF.isScalar() || isa<VPWidenRecipe>(CurrentLink)) &&
9830                  "Expected to replace a VPWidenSC");
9831           IndexOfFirstOperand = 0;
9832         }
9833         // Note that for non-commutable operands (cmp-selects), the semantics of
9834         // the cmp-select are captured in the recurrence kind.
9835         unsigned VecOpId =
9836             CurrentLink->getOperand(IndexOfFirstOperand) == PreviousLink
9837                 ? IndexOfFirstOperand + 1
9838                 : IndexOfFirstOperand;
9839         VecOp = CurrentLink->getOperand(VecOpId);
9840         assert(VecOp != PreviousLink &&
9841                CurrentLink->getOperand(CurrentLink->getNumOperands() - 1 -
9842                                        (VecOpId - IndexOfFirstOperand)) ==
9843                    PreviousLink &&
9844                "PreviousLink must be the operand other than VecOp");
9845       }
9846 
9847       BasicBlock *BB = CurrentLinkI->getParent();
9848       VPValue *CondOp = nullptr;
9849       if (CM.blockNeedsPredicationForAnyReason(BB))
9850         CondOp = RecipeBuilder.getBlockInMask(BB);
9851 
9852       auto *RedRecipe = new VPReductionRecipe(
9853           RdxDesc, CurrentLinkI, PreviousLink, VecOp, CondOp,
9854           CM.useOrderedReductions(RdxDesc), CurrentLinkI->getDebugLoc());
9855       // Append the recipe to the end of the VPBasicBlock because we need to
9856       // ensure that it comes after all of it's inputs, including CondOp.
9857       // ensure that it comes after all of its inputs, including CondOp.
9858       // with a reduction defined at the bottom of the block in the next link.
9859       LinkVPBB->appendRecipe(RedRecipe);
9860       CurrentLink->replaceAllUsesWith(RedRecipe);
9861       ToDelete.push_back(CurrentLink);
9862       PreviousLink = RedRecipe;
9863     }
9864   }
9865   VPBasicBlock *LatchVPBB = VectorLoopRegion->getExitingBasicBlock();
9866   Builder.setInsertPoint(&*LatchVPBB->begin());
9867   VPBasicBlock::iterator IP = MiddleVPBB->getFirstNonPhi();
9868   for (VPRecipeBase &R :
9869        Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
9870     VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9871     if (!PhiR)
9872       continue;
9873 
9874     const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
9875     // If tail is folded by masking, introduce selects between the phi
9876     // and the users outside the vector region of each reduction, at the
9877     // beginning of the dedicated latch block.
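    // For instance (a sketch), with header mask %m the exiting value
    //   %rdx.next = add <4 x i32> %rdx, %vec.val
    // is wrapped as
    //   select <4 x i1> %m, <4 x i32> %rdx.next, <4 x i32> %rdx
    // so that masked-off lanes keep the previous partial result.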
9878     auto *OrigExitingVPV = PhiR->getBackedgeValue();
9879     auto *NewExitingVPV = PhiR->getBackedgeValue();
9880     if (!PhiR->isInLoop() && CM.foldTailByMasking()) {
9881       VPValue *Cond = RecipeBuilder.getBlockInMask(OrigLoop->getHeader());
9882       assert(OrigExitingVPV->getDefiningRecipe()->getParent() != LatchVPBB &&
9883              "reduction recipe must be defined before latch");
9884       Type *PhiTy = PhiR->getOperand(0)->getLiveInIRValue()->getType();
9885       std::optional<FastMathFlags> FMFs =
9886           PhiTy->isFloatingPointTy()
9887               ? std::make_optional(RdxDesc.getFastMathFlags())
9888               : std::nullopt;
9889       NewExitingVPV =
9890           Builder.createSelect(Cond, OrigExitingVPV, PhiR, {}, "", FMFs);
9891       OrigExitingVPV->replaceUsesWithIf(NewExitingVPV, [](VPUser &U, unsigned) {
9892         return isa<VPInstruction>(&U) &&
9893                cast<VPInstruction>(&U)->getOpcode() ==
9894                    VPInstruction::ComputeReductionResult;
9895       });
9896       if (CM.usePredicatedReductionSelect(
9897               PhiR->getRecurrenceDescriptor().getOpcode(), PhiTy))
9898         PhiR->setOperand(1, NewExitingVPV);
9899     }
9900 
9901     // If the vector reduction can be performed in a smaller type, we truncate
9902     // then extend the loop exit value to enable InstCombine to evaluate the
9903     // entire expression in the smaller type.
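    // E.g. (a sketch), for an i32 add reduction whose values fit in i8, the
    // exiting value is rewritten as
    //   %trunc = trunc <4 x i32> %rdx.next to <4 x i8>
    //   %ext   = sext  <4 x i8>  %trunc     to <4 x i32>
    // letting InstCombine shrink the whole chain to i8.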
9904     Type *PhiTy = PhiR->getStartValue()->getLiveInIRValue()->getType();
9905     if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType() &&
9906         !RecurrenceDescriptor::isAnyOfRecurrenceKind(
9907             RdxDesc.getRecurrenceKind())) {
9908       assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
9909       Type *RdxTy = RdxDesc.getRecurrenceType();
9910       auto *Trunc =
9911           new VPWidenCastRecipe(Instruction::Trunc, NewExitingVPV, RdxTy);
9912       auto *Extnd =
9913           RdxDesc.isSigned()
9914               ? new VPWidenCastRecipe(Instruction::SExt, Trunc, PhiTy)
9915               : new VPWidenCastRecipe(Instruction::ZExt, Trunc, PhiTy);
9916 
9917       Trunc->insertAfter(NewExitingVPV->getDefiningRecipe());
9918       Extnd->insertAfter(Trunc);
9919       if (PhiR->getOperand(1) == NewExitingVPV)
9920         PhiR->setOperand(1, Extnd->getVPSingleValue());
9921       NewExitingVPV = Extnd;
9922     }
9923 
9924     // We want code in the middle block to appear to execute on the location of
9925     // the scalar loop's latch terminator because: (a) it is all compiler
9926     // generated, (b) these instructions are always executed after evaluating
9927     // the latch conditional branch, and (c) other passes may add new
9928     // predecessors which terminate on this line. This is the easiest way to
9929     // ensure we don't accidentally cause an extra step back into the loop while
9930     // debugging.
9931     DebugLoc ExitDL = OrigLoop->getLoopLatch()->getTerminator()->getDebugLoc();
9932 
9933     // TODO: At the moment ComputeReductionResult also drives creation of the
9934     // bc.merge.rdx phi nodes, hence it needs to be created unconditionally here
9935     // even for in-loop reductions, until the reduction resume value handling is
9936     // also modeled in VPlan.
9937     auto *FinalReductionResult = new VPInstruction(
9938         VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL);
9939     // Update all users outside the vector region.
9940     OrigExitingVPV->replaceUsesWithIf(
9941         FinalReductionResult, [](VPUser &User, unsigned) {
9942           auto *Parent = cast<VPRecipeBase>(&User)->getParent();
9943           return Parent && !Parent->getParent();
9944         });
9945     FinalReductionResult->insertBefore(*MiddleVPBB, IP);
9946 
9947     // Adjust AnyOf reductions; replace the reduction phi for the selected value
9948     // with a boolean reduction phi node to check if the condition is true in
9949     // any iteration. The final value is selected by the final
9950     // ComputeReductionResult.
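    // For instance (a sketch), for
    //   %r = select i1 %cmp, i32 3, i32 %r.phi
    // the select is replaced by
    //   %any = or i1 %any.phi, %cmp
    // with the phi's start value set to 'false'; ComputeReductionResult then
    // picks between 3 and the original start value based on %any.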
9951     if (RecurrenceDescriptor::isAnyOfRecurrenceKind(
9952             RdxDesc.getRecurrenceKind())) {
9953       auto *Select = cast<VPRecipeBase>(*find_if(PhiR->users(), [](VPUser *U) {
9954         return isa<VPWidenSelectRecipe>(U) ||
9955                (isa<VPReplicateRecipe>(U) &&
9956                 cast<VPReplicateRecipe>(U)->getUnderlyingInstr()->getOpcode() ==
9957                     Instruction::Select);
9958       }));
9959       VPValue *Cmp = Select->getOperand(0);
9960       // If the compare is checking the reduction PHI node, adjust it to check
9961       // the start value.
9962       if (VPRecipeBase *CmpR = Cmp->getDefiningRecipe()) {
9963         for (unsigned I = 0; I != CmpR->getNumOperands(); ++I)
9964           if (CmpR->getOperand(I) == PhiR)
9965             CmpR->setOperand(I, PhiR->getStartValue());
9966       }
9967       VPBuilder::InsertPointGuard Guard(Builder);
9968       Builder.setInsertPoint(Select);
9969 
9970       // If the true value of the select is the reduction phi, the new value is
9971       // selected if the negated condition is true in any iteration.
9972       if (Select->getOperand(1) == PhiR)
9973         Cmp = Builder.createNot(Cmp);
9974       VPValue *Or = Builder.createOr(PhiR, Cmp);
9975       Select->getVPSingleValue()->replaceAllUsesWith(Or);
9976       // Delete Select now that it has invalid types.
9977       ToDelete.push_back(Select);
9978 
9979       // Convert the reduction phi to operate on bools.
9980       PhiR->setOperand(0, Plan->getOrAddLiveIn(ConstantInt::getFalse(
9981                               OrigLoop->getHeader()->getContext())));
9982       continue;
9983     }
9984 
9985     if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(
9986             RdxDesc.getRecurrenceKind())) {
9987       // Adjust the start value for FindLastIV recurrences to use the sentinel
9988       // value after generating the ResumePhi recipe, which uses the original
9989       // start value.
9990       PhiR->setOperand(0, Plan->getOrAddLiveIn(RdxDesc.getSentinelValue()));
9991     }
9992   }
9993 
9994   VPlanTransforms::clearReductionWrapFlags(*Plan);
9995   for (VPRecipeBase *R : ToDelete)
9996     R->eraseFromParent();
9997 }
9998 
9999 void VPDerivedIVRecipe::execute(VPTransformState &State) {
10000   assert(!State.Lane && "VPDerivedIVRecipe being replicated.");
10001 
10002   // Fast-math-flags propagate from the original induction instruction.
10003   IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
10004   if (FPBinOp)
10005     State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags());
10006 
10007   Value *Step = State.get(getStepValue(), VPLane(0));
10008   Value *Index = State.get(getOperand(1), VPLane(0));
10009   Value *DerivedIV = emitTransformedIndex(
10010       State.Builder, Index, getStartValue()->getLiveInIRValue(), Step, Kind,
10011       cast_if_present<BinaryOperator>(FPBinOp));
10012   DerivedIV->setName(Name);
10013   // If index is the vector trip count, the concrete value will only be set in
10014   // prepareToExecute, leading to missed simplifications, e.g. if it is 0.
10015   // TODO: Remove the special case for the vector trip count once it is computed
10016   // in VPlan and can be used during VPlan simplification.
10017   assert((DerivedIV != Index ||
10018           getOperand(1) == &getParent()->getPlan()->getVectorTripCount()) &&
10019          "IV didn't need transforming?");
10020   State.set(this, DerivedIV, VPLane(0));
10021 }
10022 
10023 void VPReplicateRecipe::execute(VPTransformState &State) {
10024   Instruction *UI = getUnderlyingInstr();
10025   if (State.Lane) { // Generate a single instance.
10026     assert((State.VF.isScalar() || !isUniform()) &&
10027            "uniform recipe shouldn't be predicated");
10028     assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
10029     State.ILV->scalarizeInstruction(UI, this, *State.Lane, State);
10030     // Insert scalar instance packing it into a vector.
10031     if (State.VF.isVector() && shouldPack()) {
10032       // If we're constructing lane 0, initialize to start from poison.
10033       if (State.Lane->isFirstLane()) {
10034         assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
10035         Value *Poison = PoisonValue::get(
10036             VectorType::get(UI->getType(), State.VF));
10037         State.set(this, Poison);
10038       }
10039       State.packScalarIntoVectorValue(this, *State.Lane);
10040     }
10041     return;
10042   }
10043 
10044   if (IsUniform) {
10045     // Uniform within VL means we need to generate lane 0.
10046     State.ILV->scalarizeInstruction(UI, this, VPLane(0), State);
10047     return;
10048   }
10049 
10050   // A store of a loop varying value to a uniform address only needs the last
10051   // copy of the store.
10052   if (isa<StoreInst>(UI) &&
10053       vputils::isUniformAfterVectorization(getOperand(1))) {
10054     auto Lane = VPLane::getLastLaneForVF(State.VF);
10055     State.ILV->scalarizeInstruction(UI, this, VPLane(Lane), State);
10056     return;
10057   }
10058 
10059   // Generate scalar instances for all VF lanes.
10060   assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
10061   const unsigned EndLane = State.VF.getKnownMinValue();
10062   for (unsigned Lane = 0; Lane < EndLane; ++Lane)
10063     State.ILV->scalarizeInstruction(UI, this, VPLane(Lane), State);
10064 }
10065 
10066 // Determine how to lower the scalar epilogue, which depends on 1) optimising
10067 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
10068 // predication, and 4) a TTI hook that analyses whether the loop is suitable
10069 // for predication.
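// For example (assuming the usual option spelling), passing
//   -prefer-predicate-over-epilogue=predicate-dont-vectorize
// takes branch 2) below and returns CM_ScalarEpilogueNotAllowedUsePredicate,
// independent of the loop hints and the TTI hook.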
10070 static ScalarEpilogueLowering getScalarEpilogueLowering(
10071     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
10072     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
10073     LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI) {
10074   // 1) OptSize takes precedence over all other options, i.e. if this is set,
10075   // don't look at hints or options, and don't request a scalar epilogue.
10076   // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
10077   // LoopAccessInfo (due to code dependency and not being able to reliably get
10078   // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
10079   // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
10080   // versioning when the vectorization is forced, unlike hasOptSize. So revert
10081   // back to the old way and vectorize with versioning when forced. See D81345.)
10082   if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
10083                                                       PGSOQueryType::IRPass) &&
10084                           Hints.getForce() != LoopVectorizeHints::FK_Enabled))
10085     return CM_ScalarEpilogueNotAllowedOptSize;
10086 
10087   // 2) If set, obey the directives
10088   if (PreferPredicateOverEpilogue.getNumOccurrences()) {
10089     switch (PreferPredicateOverEpilogue) {
10090     case PreferPredicateTy::ScalarEpilogue:
10091       return CM_ScalarEpilogueAllowed;
10092     case PreferPredicateTy::PredicateElseScalarEpilogue:
10093       return CM_ScalarEpilogueNotNeededUsePredicate;
10094     case PreferPredicateTy::PredicateOrDontVectorize:
10095       return CM_ScalarEpilogueNotAllowedUsePredicate;
10096     };
10097   }
10098 
10099   // 3) If set, obey the hints
10100   switch (Hints.getPredicate()) {
10101   case LoopVectorizeHints::FK_Enabled:
10102     return CM_ScalarEpilogueNotNeededUsePredicate;
10103   case LoopVectorizeHints::FK_Disabled:
10104     return CM_ScalarEpilogueAllowed;
10105   };
10106 
10107   // 4) if the TTI hook indicates this is profitable, request predication.
10108   TailFoldingInfo TFI(TLI, &LVL, IAI);
10109   if (TTI->preferPredicateOverEpilogue(&TFI))
10110     return CM_ScalarEpilogueNotNeededUsePredicate;
10111 
10112   return CM_ScalarEpilogueAllowed;
10113 }
10114 
10115 // Process the loop in the VPlan-native vectorization path. This path builds
10116 // VPlan upfront in the vectorization pipeline, which allows applying
10117 // VPlan-to-VPlan transformations from the very beginning without modifying the
10118 // input LLVM IR.
10119 static bool processLoopInVPlanNativePath(
10120     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
10121     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
10122     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
10123     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
10124     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
10125     LoopVectorizationRequirements &Requirements) {
10126 
10127   if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
10128     LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
10129     return false;
10130   }
10131   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
10132   Function *F = L->getHeader()->getParent();
10133   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
10134 
10135   ScalarEpilogueLowering SEL =
10136       getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, *LVL, &IAI);
10137 
10138   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
10139                                 &Hints, IAI);
10140   // Use the planner for outer loop vectorization.
10141   // TODO: CM is not used at this point inside the planner. Turn CM into an
10142   // optional argument if we don't need it in the future.
10143   LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, IAI, PSE, Hints,
10144                                ORE);
10145 
10146   // Get user vectorization factor.
10147   ElementCount UserVF = Hints.getWidth();
10148 
10149   CM.collectElementTypesForWidening();
10150 
10151   // Plan how to best vectorize, return the best VF and its cost.
10152   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
10153 
10154   // If we are stress testing VPlan builds, do not attempt to generate vector
10155   // code. Masked vector code generation support will follow soon.
10156   // Also, do not attempt to vectorize if no vector code will be produced.
10157   if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF)
10158     return false;
10159 
10160   VPlan &BestPlan = LVP.getPlanFor(VF.Width);
10161 
10162   {
10163     bool AddBranchWeights =
10164         hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
10165     GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(),
10166                              AddBranchWeights, CM.CostKind);
10167     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
10168                            VF.Width, 1, LVL, &CM, BFI, PSI, Checks, BestPlan);
10169     LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
10170                       << L->getHeader()->getParent()->getName() << "\"\n");
10171     LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false);
10172   }
10173 
10174   reportVectorization(ORE, L, VF, 1);
10175 
10176   // Mark the loop as already vectorized to avoid vectorizing again.
10177   Hints.setAlreadyVectorized();
10178   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10179   return true;
10180 }
10181 
10182 // Emit a remark if there are stores to floats that required a floating point
10183 // extension. If the vectorized loop mixes float and double precision there
10184 // will be a performance penalty from the conversion overhead and the change in
10185 // the vector width.
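//
// A typical trigger (illustrative): 'f[i] = f[i] * 2.0' with float f[] - the
// double literal forces an fpext of f[i], a double multiply, and an fptrunc
// back to float before the store, found by the upward walk below.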
10186 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
10187   SmallVector<Instruction *, 4> Worklist;
10188   for (BasicBlock *BB : L->getBlocks()) {
10189     for (Instruction &Inst : *BB) {
10190       if (auto *S = dyn_cast<StoreInst>(&Inst)) {
10191         if (S->getValueOperand()->getType()->isFloatTy())
10192           Worklist.push_back(S);
10193       }
10194     }
10195   }
10196 
10197   // Traverse the floating point stores upwards, searching for floating point
10198   // conversions.
10199   SmallPtrSet<const Instruction *, 4> Visited;
10200   SmallPtrSet<const Instruction *, 4> EmittedRemark;
10201   while (!Worklist.empty()) {
10202     auto *I = Worklist.pop_back_val();
10203     if (!L->contains(I))
10204       continue;
10205     if (!Visited.insert(I).second)
10206       continue;
10207 
10208     // Emit a remark if the floating point store required a floating
10209     // point conversion.
10210     // TODO: More work could be done to identify the root cause such as a
10211     // constant or a function return type and point the user to it.
10212     if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
10213       ORE->emit([&]() {
10214         return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
10215                                           I->getDebugLoc(), L->getHeader())
10216                << "floating point conversion changes vector width. "
10217                << "Mixed floating point precision requires an up/down "
10218                << "cast that will negatively impact performance.";
10219       });
10220 
10221     for (Use &Op : I->operands())
10222       if (auto *OpI = dyn_cast<Instruction>(Op))
10223         Worklist.push_back(OpI);
10224   }
10225 }
10226 
10227 static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
10228                                        VectorizationFactor &VF, Loop *L,
10229                                        const TargetTransformInfo &TTI,
10230                                        PredicatedScalarEvolution &PSE,
10231                                        ScalarEpilogueLowering SEL) {
10232   InstructionCost CheckCost = Checks.getCost();
10233   if (!CheckCost.isValid())
10234     return false;
10235 
10236   // When interleaving only, the scalar and vector costs will be equal, which
10237   // in turn would lead to a divide by 0. Fall back to the hard threshold.
10238   if (VF.Width.isScalar()) {
10239     if (CheckCost > VectorizeMemoryCheckThreshold) {
10240       LLVM_DEBUG(
10241           dbgs()
10242           << "LV: Interleaving only is not profitable due to runtime checks\n");
10243       return false;
10244     }
10245     return true;
10246   }
10247 
10248   // The scalar cost should only be 0 when vectorizing with a user specified
        // VF/IC. In those cases, runtime checks should always be generated.
10249   uint64_t ScalarC = *VF.ScalarCost.getValue();
10250   if (ScalarC == 0)
10251     return true;
10252 
10253   // First, compute the minimum iteration count required so that the vector
10254   // loop outperforms the scalar loop.
10255   //  The total cost of the scalar loop is
10256   //   ScalarC * TC
10257   //  where
10258   //  * TC is the actual trip count of the loop.
10259   //  * ScalarC is the cost of a single scalar iteration.
10260   //
10261   //  The total cost of the vector loop is
10262   //    RtC + VecC * (TC / VF) + EpiC
10263   //  where
10264   //  * RtC is the cost of the generated runtime checks
10265   //  * VecC is the cost of a single vector iteration.
10266   //  * TC is the actual trip count of the loop
10267   //  * VF is the vectorization factor
10268   //  * EpiCost is the cost of the generated epilogue, including the cost
10269   //    of the remaining scalar operations.
10270   //
10271   // Vectorization is profitable once the total vector cost is less than the
10272   // total scalar cost:
10273   //   RtC + VecC * (TC / VF) + EpiC <  ScalarC * TC
10274   //
10275   // Now we can compute the minimum required trip count TC as
10276   //   VF * (RtC + EpiC) / (ScalarC * VF - VecC) < TC
10277   //
10278   // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
10279   // the computations are performed on doubles, not integers and the result
10280   // is rounded up, hence we get an upper estimate of the TC.
10281   unsigned IntVF = getEstimatedRuntimeVF(L, TTI, VF.Width);
10282   uint64_t RtC = *CheckCost.getValue();
10283   uint64_t Div = ScalarC * IntVF - *VF.Cost.getValue();
10284   uint64_t MinTC1 = Div == 0 ? 0 : divideCeil(RtC * IntVF, Div);
10285 
10286   // Second, compute a minimum iteration count so that the cost of the
10287   // runtime checks is only a fraction of the total scalar loop cost. This
10288   // adds a loop-dependent bound on the overhead incurred if the runtime
10289   // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC
10290   // * TC. To bound the runtime check to be a fraction 1/X of the scalar
10291   // cost, compute
10292   //   RtC < ScalarC * TC * (1 / X)  ==>  RtC * X / ScalarC < TC
10293   uint64_t MinTC2 = divideCeil(RtC * 10, ScalarC);
10294 
10295   // Now pick the larger minimum. If it is not a multiple of VF and a scalar
10296   // epilogue is allowed, choose the next closest multiple of VF. This should
10297   // partly compensate for ignoring the epilogue cost.
10298   uint64_t MinTC = std::max(MinTC1, MinTC2);
10299   if (SEL == CM_ScalarEpilogueAllowed)
10300     MinTC = alignTo(MinTC, IntVF);
10301   VF.MinProfitableTripCount = ElementCount::getFixed(MinTC);
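  // Worked example (illustrative numbers only, not from any target): with
  // ScalarC = 4, VecC = 10, RtC = 20 and an estimated runtime VF of 4:
  //   MinTC1 = ceil(20 * 4 / (4 * 4 - 10)) = ceil(80 / 6)  = 14
  //   MinTC2 = ceil(20 * 10 / 4)           = ceil(200 / 4) = 50
  // so MinTC = max(14, 50) = 50, rounded up to 52 when a scalar epilogue is
  // allowed (next multiple of VF).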
10302 
10303   LLVM_DEBUG(
10304       dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
10305              << VF.MinProfitableTripCount << "\n");
10306 
10307   // Skip vectorization if the expected trip count is less than the minimum
10308   // required trip count.
10309   if (auto ExpectedTC = getSmallBestKnownTC(PSE, L)) {
10310     if (ElementCount::isKnownLT(ElementCount::getFixed(*ExpectedTC),
10311                                 VF.MinProfitableTripCount)) {
10312       LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
10313                            "trip count < minimum profitable VF ("
10314                         << *ExpectedTC << " < " << VF.MinProfitableTripCount
10315                         << ")\n");
10316 
10317       return false;
10318     }
10319   }
10320   return true;
10321 }
10322 
10323 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
10324     : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
10325                                !EnableLoopInterleaving),
10326       VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
10327                               !EnableLoopVectorization) {}
10328 
10329 /// Prepare \p MainPlan for vectorizing the main vector loop during epilogue
10330 /// vectorization. Remove ResumePhis from \p MainPlan for inductions that
10331 /// don't have a corresponding wide induction in \p EpiPlan.
10332 static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) {
10333   // Collect PHI nodes of widened phis in the VPlan for the epilogue. Those
10334   // will need their resume-values computed in the main vector loop. Others
10335   // can be removed from the main VPlan.
10336   SmallPtrSet<PHINode *, 2> EpiWidenedPhis;
10337   for (VPRecipeBase &R :
10338        EpiPlan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
10339     if (isa<VPCanonicalIVPHIRecipe>(&R))
10340       continue;
10341     EpiWidenedPhis.insert(
10342         cast<PHINode>(R.getVPSingleValue()->getUnderlyingValue()));
10343   }
10344   for (VPRecipeBase &R : make_early_inc_range(
10345            *cast<VPIRBasicBlock>(MainPlan.getScalarHeader()))) {
10346     auto *VPIRInst = cast<VPIRInstruction>(&R);
10347     auto *IRI = dyn_cast<PHINode>(&VPIRInst->getInstruction());
10348     if (!IRI)
10349       break;
10350     if (EpiWidenedPhis.contains(IRI))
10351       continue;
10352     // There is no corresponding wide induction in the epilogue plan that would
10353     // need a resume value. Remove the VPIRInst wrapping the scalar header phi
10354     // together with the corresponding ResumePhi. The resume values for the
10355     // scalar loop will be created during execution of EpiPlan.
10356     VPRecipeBase *ResumePhi = VPIRInst->getOperand(0)->getDefiningRecipe();
10357     VPIRInst->eraseFromParent();
10358     ResumePhi->eraseFromParent();
10359   }
10360   VPlanTransforms::removeDeadRecipes(MainPlan);
10361 
10362   using namespace VPlanPatternMatch;
10363   VPBasicBlock *MainScalarPH = MainPlan.getScalarPreheader();
10364   VPValue *VectorTC = &MainPlan.getVectorTripCount();
10365   // If there is a suitable resume value for the canonical induction in the
10366   // scalar (which will become vector) epilogue loop we are done. Otherwise
10367   // create it below.
10368   if (any_of(*MainScalarPH, [VectorTC](VPRecipeBase &R) {
10369         return match(&R, m_VPInstruction<VPInstruction::ResumePhi>(
10370                              m_Specific(VectorTC), m_SpecificInt(0)));
10371       }))
10372     return;
10373   VPBuilder ScalarPHBuilder(MainScalarPH, MainScalarPH->begin());
10374   ScalarPHBuilder.createNaryOp(
10375       VPInstruction::ResumePhi,
10376       {VectorTC, MainPlan.getCanonicalIV()->getStartValue()}, {},
10377       "vec.epilog.resume.val");
10378 }
10379 
10380 /// Prepare \p Plan for vectorizing the epilogue loop. That is, re-use expanded
10381 /// SCEVs from \p ExpandedSCEVs and set resume values for header recipes.
10382 static void
10383 preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L,
10384                                  const SCEV2ValueTy &ExpandedSCEVs,
10385                                  const EpilogueLoopVectorizationInfo &EPI) {
10386   VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
10387   VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
10388   Header->setName("vec.epilog.vector.body");
10389 
10390   // Re-use the trip count and steps expanded for the main loop, as
10391   // skeleton creation needs it as a value that dominates both the scalar
10392   // and vector epilogue loops
10393   // TODO: This is a workaround needed for epilogue vectorization and it
10394   // should be removed once induction resume value creation is done
10395   // directly in VPlan.
10396   for (auto &R : make_early_inc_range(*Plan.getEntry())) {
10397     auto *ExpandR = dyn_cast<VPExpandSCEVRecipe>(&R);
10398     if (!ExpandR)
10399       continue;
10400     auto *ExpandedVal =
10401         Plan.getOrAddLiveIn(ExpandedSCEVs.find(ExpandR->getSCEV())->second);
10402     ExpandR->replaceAllUsesWith(ExpandedVal);
10403     if (Plan.getTripCount() == ExpandR)
10404       Plan.resetTripCount(ExpandedVal);
10405     ExpandR->eraseFromParent();
10406   }
10407 
10408   // Ensure that the start values for all header phi recipes are updated before
10409   // vectorizing the epilogue loop.
10410   for (VPRecipeBase &R : Header->phis()) {
10411     if (auto *IV = dyn_cast<VPCanonicalIVPHIRecipe>(&R)) {
10412       // When vectorizing the epilogue loop, the canonical induction start
10413       // value needs to be changed from zero to the value after the main
10414       // vector loop. Find the resume value created during execution of the main
10415       // VPlan.
10416       // FIXME: Improve modeling for canonical IV start values in the epilogue
10417       // loop.
10418       BasicBlock *MainMiddle = find_singleton<BasicBlock>(
10419           predecessors(L->getLoopPreheader()),
10420           [&EPI](BasicBlock *BB, bool) -> BasicBlock * {
10421             if (BB != EPI.MainLoopIterationCountCheck &&
10422                 BB != EPI.EpilogueIterationCountCheck &&
10423                 BB != EPI.SCEVSafetyCheck && BB != EPI.MemSafetyCheck)
10424               return BB;
10425             return nullptr;
10426           });
10427       using namespace llvm::PatternMatch;
10428       Type *IdxTy = IV->getScalarType();
10429       PHINode *EPResumeVal = find_singleton<PHINode>(
10430           L->getLoopPreheader()->phis(),
10431           [&EPI, IdxTy, MainMiddle](PHINode &P, bool) -> PHINode * {
10432             if (P.getType() == IdxTy &&
10433                 P.getIncomingValueForBlock(MainMiddle) == EPI.VectorTripCount &&
10434                 match(
10435                     P.getIncomingValueForBlock(EPI.MainLoopIterationCountCheck),
10436                     m_SpecificInt(0)))
10437               return &P;
10438             return nullptr;
10439           });
10440       assert(EPResumeVal && "must have a resume value for the canonical IV");
10441       VPValue *VPV = Plan.getOrAddLiveIn(EPResumeVal);
10442       assert(all_of(IV->users(),
10443                     [](const VPUser *U) {
10444                       return isa<VPScalarIVStepsRecipe>(U) ||
10445                              isa<VPScalarCastRecipe>(U) ||
10446                              isa<VPDerivedIVRecipe>(U) ||
10447                              cast<VPInstruction>(U)->getOpcode() ==
10448                                  Instruction::Add;
10449                     }) &&
10450              "the canonical IV should only be used by its increment or "
10451              "ScalarIVSteps when resetting the start value");
10452       IV->setOperand(0, VPV);
10453       continue;
10454     }
10455 
10456     Value *ResumeV = nullptr;
10457     // TODO: Move setting of resume values to prepareToExecute.
10458     if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
10459       ResumeV = cast<PHINode>(ReductionPhi->getUnderlyingInstr())
10460                     ->getIncomingValueForBlock(L->getLoopPreheader());
10461       const RecurrenceDescriptor &RdxDesc =
10462           ReductionPhi->getRecurrenceDescriptor();
10463       RecurKind RK = RdxDesc.getRecurrenceKind();
10464       if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
10465         // VPReductionPHIRecipes for AnyOf reductions expect a boolean as
10466         // start value; compare the final value from the main vector loop
10467         // to the start value.
10468         IRBuilder<> Builder(
10469             cast<Instruction>(ResumeV)->getParent()->getFirstNonPHI());
10470         ResumeV =
10471             Builder.CreateICmpNE(ResumeV, RdxDesc.getRecurrenceStartValue());
10472       } else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) {
10473         // VPReductionPHIRecipe for FindLastIV reductions requires an adjustment
10474         // to the resume value. The resume value is adjusted to the sentinel
10475         // value when the final value from the main vector loop equals the start
10476         // value. This ensures correctness when the start value might not be
10477         // less than the minimum value of a monotonically increasing induction
10478         // variable.
10479         IRBuilder<> Builder(
10480             cast<Instruction>(ResumeV)->getParent()->getFirstNonPHI());
10481         Value *Cmp =
10482             Builder.CreateICmpEQ(ResumeV, RdxDesc.getRecurrenceStartValue());
10483         ResumeV =
10484             Builder.CreateSelect(Cmp, RdxDesc.getSentinelValue(), ResumeV);
10485       }
10486     } else {
10487       // Retrieve the induction resume values for wide inductions from
10488       // their original phi nodes in the scalar loop.
10489       PHINode *IndPhi = cast<VPWidenInductionRecipe>(&R)->getPHINode();
10490       // Hook up to the PHINode generated by a ResumePhi recipe of main
10491       // loop VPlan, which feeds the scalar loop.
10492       ResumeV = IndPhi->getIncomingValueForBlock(L->getLoopPreheader());
10493     }
10494     assert(ResumeV && "Must have a resume value");
10495     VPValue *StartVal = Plan.getOrAddLiveIn(ResumeV);
10496     cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
10497   }
10498 }
10499 
10500 bool LoopVectorizePass::processLoop(Loop *L) {
10501   assert((EnableVPlanNativePath || L->isInnermost()) &&
10502          "VPlan-native path is not enabled. Only process inner loops.");
10503 
10504   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
10505                     << L->getHeader()->getParent()->getName() << "' from "
10506                     << L->getLocStr() << "\n");
10507 
10508   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
10509 
10510   LLVM_DEBUG(
10511       dbgs() << "LV: Loop hints:"
10512              << " force="
10513              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
10514                      ? "disabled"
10515                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
10516                             ? "enabled"
10517                             : "?"))
10518              << " width=" << Hints.getWidth()
10519              << " interleave=" << Hints.getInterleave() << "\n");
10520 
10521   // Function containing loop
10522   Function *F = L->getHeader()->getParent();
10523 
10524   // Looking at the diagnostic output is the only way to determine if a loop
10525   // was vectorized (other than looking at the IR or machine code), so it
10526   // is important to generate an optimization remark for each loop. Most of
10527   // these messages are generated as OptimizationRemarkAnalysis. Remarks
10528   // generated as OptimizationRemark and OptimizationRemarkMissed are
10529   // less verbose, reporting vectorized loops and unvectorized loops that may
10530   // benefit from vectorization, respectively.
10531 
10532   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
10533     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
10534     return false;
10535   }
10536 
10537   PredicatedScalarEvolution PSE(*SE, *L);
10538 
10539   // Check if it is legal to vectorize the loop.
10540   LoopVectorizationRequirements Requirements;
10541   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE,
10542                                 &Requirements, &Hints, DB, AC, BFI, PSI);
10543   if (!LVL.canVectorize(EnableVPlanNativePath)) {
10544     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
10545     Hints.emitRemarkWithHints();
10546     return false;
10547   }
10548 
10549   if (LVL.hasUncountableEarlyExit() && !EnableEarlyExitVectorization) {
10550     reportVectorizationFailure("Auto-vectorization of loops with uncountable "
10551                                "early exit is not enabled",
10552                                "UncountableEarlyExitLoopsDisabled", ORE, L);
10553     return false;
10554   }
10555 
10556   if (LVL.hasStructVectorCall()) {
10557     reportVectorizationFailure("Auto-vectorization of calls that return struct "
10558                                "types is not yet supported",
10559                                "StructCallVectorizationUnsupported", ORE, L);
10560     return false;
10561   }
10562 
10563   // Entrance to the VPlan-native vectorization path. Outer loops are processed
10564   // here. They may require CFG and instruction level transformations before
10565   // even evaluating whether vectorization is profitable. Since we cannot modify
10566   // the incoming IR, we need to build VPlan upfront in the vectorization
10567   // pipeline.
10568   if (!L->isInnermost())
10569     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
10570                                         ORE, BFI, PSI, Hints, Requirements);
10571 
10572   assert(L->isInnermost() && "Inner loop expected.");
10573 
10574   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
10575   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
10576 
10577   // If an override option has been passed in for interleaved accesses, use it.
10578   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
10579     UseInterleaved = EnableInterleavedMemAccesses;
10580 
10581   // Analyze interleaved memory accesses.
10582   if (UseInterleaved)
10583     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
10584 
10585   if (LVL.hasUncountableEarlyExit()) {
10586     BasicBlock *LoopLatch = L->getLoopLatch();
10587     if (IAI.requiresScalarEpilogue() ||
10588         any_of(LVL.getCountableExitingBlocks(),
10589                [LoopLatch](BasicBlock *BB) { return BB != LoopLatch; })) {
10590       reportVectorizationFailure("Auto-vectorization of early exit loops "
10591                                  "requiring a scalar epilogue is unsupported",
10592                                  "UncountableEarlyExitUnsupported", ORE, L);
10593       return false;
10594     }
10595   }
10596 
10597   // Check the function attributes and profiles to find out if this function
10598   // should be optimized for size.
10599   ScalarEpilogueLowering SEL =
10600       getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL, &IAI);
10601 
10602   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
10603   // count by optimizing for size, to minimize overheads.
10604   auto ExpectedTC = getSmallBestKnownTC(PSE, L);
10605   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
10606     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
10607                       << "This loop is worth vectorizing only if no scalar "
10608                       << "iteration overheads are incurred.");
10609     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
10610       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
10611     else {
10612       if (*ExpectedTC > TTI->getMinTripCountTailFoldingThreshold()) {
10613         LLVM_DEBUG(dbgs() << "\n");
10614         // Predicated tail-folded loops are efficient even when the loop
10615         // iteration count is low. However, setting the epilogue policy to
10616         // `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops
10617         // with runtime checks. It's more effective to let
10618         // `areRuntimeChecksProfitable` determine if vectorization is beneficial
10619         // for the loop.
10620         if (SEL != CM_ScalarEpilogueNotNeededUsePredicate)
10621           SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
10622       } else {
10623         LLVM_DEBUG(dbgs() << " But the target considers the trip count too "
10624                              "small to consider vectorizing.\n");
10625         reportVectorizationFailure(
10626             "The trip count is below the minimal threshold value.",
10627             "loop trip count is too low, avoiding vectorization",
10628             "LowTripCount", ORE, L);
10629         Hints.emitRemarkWithHints();
10630         return false;
10631       }
10632     }
10633   }
10634 
10635   // Check the function attributes to see if implicit floats or vectors are
10636   // allowed.
10637   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
10638     reportVectorizationFailure(
10639         "Can't vectorize when the NoImplicitFloat attribute is used",
10640         "loop not vectorized due to NoImplicitFloat attribute",
10641         "NoImplicitFloat", ORE, L);
10642     Hints.emitRemarkWithHints();
10643     return false;
10644   }
10645 
10646   // Check if the target supports potentially unsafe FP vectorization.
10647   // FIXME: Add a check for the type of safety issue (denormal, signaling)
10648   // for the target we're vectorizing for, to make sure none of the
10649   // additional fp-math flags can help.
10650   if (Hints.isPotentiallyUnsafe() &&
10651       TTI->isFPVectorizationPotentiallyUnsafe()) {
10652     reportVectorizationFailure(
10653         "Potentially unsafe FP op prevents vectorization",
10654         "loop not vectorized due to unsafe FP support.",
10655         "UnsafeFP", ORE, L);
10656     Hints.emitRemarkWithHints();
10657     return false;
10658   }
10659 
10660   bool AllowOrderedReductions;
10661   // If the flag is set, use that instead and override the TTI behaviour.
10662   if (ForceOrderedReductions.getNumOccurrences() > 0)
10663     AllowOrderedReductions = ForceOrderedReductions;
10664   else
10665     AllowOrderedReductions = TTI->enableOrderedReductions();
10666   if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) {
10667     ORE->emit([&]() {
10668       auto *ExactFPMathInst = Requirements.getExactFPInst();
10669       return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
10670                                                  ExactFPMathInst->getDebugLoc(),
10671                                                  ExactFPMathInst->getParent())
10672              << "loop not vectorized: cannot prove it is safe to reorder "
10673                 "floating-point operations";
10674     });
10675     LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
10676                          "reorder floating-point operations\n");
10677     Hints.emitRemarkWithHints();
10678     return false;
10679   }
10680 
10681   // Use the cost model.
10682   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
10683                                 F, &Hints, IAI);
10684   // Use the planner for vectorization.
10685   LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
10686                                ORE);
10687 
10688   // Get user vectorization factor and interleave count.
10689   ElementCount UserVF = Hints.getWidth();
10690   unsigned UserIC = Hints.getInterleave();
10691 
10692   // Plan how to best vectorize.
10693   LVP.plan(UserVF, UserIC);
10694   VectorizationFactor VF = LVP.computeBestVF();
10695   unsigned IC = 1;
10696 
10697   if (ORE->allowExtraAnalysis(LV_NAME))
10698     LVP.emitInvalidCostRemarks(ORE);
10699 
10700   bool AddBranchWeights =
10701       hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
10702   GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(),
10703                            AddBranchWeights, CM.CostKind);
10704   if (LVP.hasPlanWithVF(VF.Width)) {
10705     // Select the interleave count.
10706     IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
10707 
10708     unsigned SelectedIC = std::max(IC, UserIC);
10709     //  Optimistically generate runtime checks if they are needed. Drop them if
10710     //  they turn out to not be profitable.
10711     if (VF.Width.isVector() || SelectedIC > 1)
10712       Checks.create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
10713 
10714     // Check if it is profitable to vectorize with runtime checks.
10715     bool ForceVectorization =
10716         Hints.getForce() == LoopVectorizeHints::FK_Enabled;
10717     if (!ForceVectorization &&
10718         !areRuntimeChecksProfitable(Checks, VF, L, *TTI, PSE, SEL)) {
10719       ORE->emit([&]() {
10720         return OptimizationRemarkAnalysisAliasing(
10721                    DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
10722                    L->getHeader())
10723                << "loop not vectorized: cannot prove it is safe to reorder "
10724                   "memory operations";
10725       });
10726       LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
10727       Hints.emitRemarkWithHints();
10728       return false;
10729     }
10730   }
10731 
10732   // Identify the diagnostic messages that should be produced.
10733   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
10734   bool VectorizeLoop = true, InterleaveLoop = true;
10735   if (VF.Width.isScalar()) {
10736     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
10737     VecDiagMsg = std::make_pair(
10738         "VectorizationNotBeneficial",
10739         "the cost-model indicates that vectorization is not beneficial");
10740     VectorizeLoop = false;
10741   }
10742 
10743   if (!LVP.hasPlanWithVF(VF.Width) && UserIC > 1) {
10744     // Tell the user interleaving was avoided up-front, despite being explicitly
10745     // requested.
10746     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
10747                          "interleaving should be avoided up front\n");
10748     IntDiagMsg = std::make_pair(
10749         "InterleavingAvoided",
10750         "Ignoring UserIC, because interleaving was avoided up front");
10751     InterleaveLoop = false;
10752   } else if (IC == 1 && UserIC <= 1) {
10753     // Tell the user interleaving is not beneficial.
10754     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
10755     IntDiagMsg = std::make_pair(
10756         "InterleavingNotBeneficial",
10757         "the cost-model indicates that interleaving is not beneficial");
10758     InterleaveLoop = false;
10759     if (UserIC == 1) {
10760       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
10761       IntDiagMsg.second +=
10762           " and is explicitly disabled or interleave count is set to 1";
10763     }
10764   } else if (IC > 1 && UserIC == 1) {
10765     // Tell the user interleaving is beneficial, but it is explicitly disabled.
10766     LLVM_DEBUG(
10767         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
10768     IntDiagMsg = std::make_pair(
10769         "InterleavingBeneficialButDisabled",
10770         "the cost-model indicates that interleaving is beneficial "
10771         "but is explicitly disabled or interleave count is set to 1");
10772     InterleaveLoop = false;
10773   }
10774 
10775   // If there is a histogram in the loop, do not just interleave without
10776   // vectorizing. The order of operations will be incorrect without the
10777   // histogram intrinsics, which are only used for recipes with VF > 1.
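  // Editorial illustration: a histogram loop looks like
  //   for (int I = 0; I < N; ++I)
  //     Buckets[Indices[I]]++;
  // Legality accepts the loop-carried read-modify-write only on the assumption
  // that it becomes a histogram intrinsic for VF > 1; interleaving the scalar
  // updates instead could reorder loads past stores for repeated indices and
  // lose counts, so interleaving-only is disabled here.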
10778   if (!VectorizeLoop && InterleaveLoop && LVL.hasHistograms()) {
10779     LLVM_DEBUG(dbgs() << "LV: Not interleaving without vectorization due "
10780                       << "to histogram operations.\n");
10781     IntDiagMsg = std::make_pair(
10782         "HistogramPreventsScalarInterleaving",
10783         "Unable to interleave without vectorization due to constraints on "
10784         "the order of histogram operations");
10785     InterleaveLoop = false;
10786   }
10787 
10788   // Override IC if user provided an interleave count.
10789   IC = UserIC > 0 ? UserIC : IC;
10790 
10791   // Emit diagnostic messages, if any.
10792   const char *VAPassName = Hints.vectorizeAnalysisPassName();
10793   if (!VectorizeLoop && !InterleaveLoop) {
10794     // Do not vectorize or interleave the loop.
10795     ORE->emit([&]() {
10796       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
10797                                       L->getStartLoc(), L->getHeader())
10798              << VecDiagMsg.second;
10799     });
10800     ORE->emit([&]() {
10801       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
10802                                       L->getStartLoc(), L->getHeader())
10803              << IntDiagMsg.second;
10804     });
10805     return false;
10806   }
10807 
10808   if (!VectorizeLoop && InterleaveLoop) {
10809     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10810     ORE->emit([&]() {
10811       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
10812                                         L->getStartLoc(), L->getHeader())
10813              << VecDiagMsg.second;
10814     });
10815   } else if (VectorizeLoop && !InterleaveLoop) {
10816     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10817                       << ") in " << L->getLocStr() << '\n');
10818     ORE->emit([&]() {
10819       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
10820                                         L->getStartLoc(), L->getHeader())
10821              << IntDiagMsg.second;
10822     });
10823   } else if (VectorizeLoop && InterleaveLoop) {
10824     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10825                       << ") in " << L->getLocStr() << '\n');
10826     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10827   }
10828 
10829   bool DisableRuntimeUnroll = false;
10830   MDNode *OrigLoopID = L->getLoopID();
10831   {
10832     using namespace ore;
10833     if (!VectorizeLoop) {
10834       assert(IC > 1 && "interleave count should not be 1 or 0");
10835       // If we decided that it is not profitable to vectorize the loop, then
10836       // interleave it.
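      // Editorial illustration: with VF = 1 and, say, IC = 2, executing the
      // plan effectively produces
      //   for (int I = 0; I < N; I += 2) {
      //     ... body for iteration I ...
      //     ... body for iteration I + 1 ...
      //   }
      // plus a scalar remainder loop, improving ILP without vector registers.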
10837       VPlan &BestPlan = LVP.getPlanFor(VF.Width);
10838       InnerLoopVectorizer Unroller(
10839           L, PSE, LI, DT, TLI, TTI, AC, ORE, ElementCount::getFixed(1),
10840           ElementCount::getFixed(1), IC, &LVL, &CM, BFI, PSI, Checks, BestPlan);
10841 
10842       LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);
10843 
10844       ORE->emit([&]() {
10845         return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
10846                                   L->getHeader())
10847                << "interleaved loop (interleave count: "
10848                << NV("InterleaveCount", IC) << ")";
10849       });
10850     } else {
10851       // If we decided that it is *profitable* to vectorize the loop, then do it.
10852 
10853       VPlan &BestPlan = LVP.getPlanFor(VF.Width);
10854       // Consider vectorizing the epilogue too if it's profitable.
10855       VectorizationFactor EpilogueVF =
10856           LVP.selectEpilogueVectorizationFactor(VF.Width, IC);
10857       if (EpilogueVF.Width.isVector()) {
10858         std::unique_ptr<VPlan> BestMainPlan(BestPlan.duplicate());
10859 
10860         // The first pass vectorizes the main loop and creates a scalar epilogue
10861         // to be vectorized by executing the plan (potentially with a different
10862         // factor) again shortly afterwards.
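        // Editorial sketch of the resulting structure (assuming, e.g.,
        // VF.Width = 8 and EpilogueVF.Width = 4):
        //   main vector loop      (8 lanes x IC)
        //   epilogue vector loop  (4 lanes)
        //   scalar remainder loop
        // each guarded by a minimum-iteration-count check.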
10863         VPlan &BestEpiPlan = LVP.getPlanFor(EpilogueVF.Width);
10864         preparePlanForMainVectorLoop(*BestMainPlan, BestEpiPlan);
10865         EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1,
10866                                           BestEpiPlan);
10867         EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
10868                                            EPI, &LVL, &CM, BFI, PSI, Checks,
10869                                            *BestMainPlan);
10870         auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF,
10871                                              *BestMainPlan, MainILV, DT, false);
10872         ++LoopsVectorized;
10873 
10874         // Second pass vectorizes the epilogue and adjusts the control flow
10875         // edges from the first pass.
10876         EPI.MainLoopVF = EPI.EpilogueVF;
10877         EPI.MainLoopUF = EPI.EpilogueUF;
10878         EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
10879                                                  ORE, EPI, &LVL, &CM, BFI, PSI,
10880                                                  Checks, BestEpiPlan);
10881         EpilogILV.setTripCount(MainILV.getTripCount());
10882         preparePlanForEpilogueVectorLoop(BestEpiPlan, L, ExpandedSCEVs, EPI);
10883 
10884         LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
10885                         DT, true, &ExpandedSCEVs);
10886         ++LoopsEpilogueVectorized;
10887 
10888         if (!MainILV.areSafetyChecksAdded())
10889           DisableRuntimeUnroll = true;
10890       } else {
10891         InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
10892                                VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
10893                                PSI, Checks, BestPlan);
10894         LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
10895         ++LoopsVectorized;
10896 
10897         // Add metadata to disable runtime unrolling of the scalar loop when
10898         // there are no runtime checks for strides and memory. A scalar loop
10899         // that is rarely executed is not worth unrolling.
10900         if (!LB.areSafetyChecksAdded())
10901           DisableRuntimeUnroll = true;
10902       }
10903       // Report the vectorization decision.
10904       reportVectorization(ORE, L, VF, IC);
10905     }
10906 
10907     if (ORE->allowExtraAnalysis(LV_NAME))
10908       checkMixedPrecision(L, ORE);
10909   }
10910 
10911   assert(DT->verify(DominatorTree::VerificationLevel::Fast) &&
10912          "DT not preserved correctly");
10913 
10914   std::optional<MDNode *> RemainderLoopID =
10915       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
10916                                       LLVMLoopVectorizeFollowupEpilogue});
10917   if (RemainderLoopID) {
10918     L->setLoopID(*RemainderLoopID);
10919   } else {
10920     if (DisableRuntimeUnroll)
10921       addRuntimeUnrollDisableMetaData(L);
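    // Editorial note: addRuntimeUnrollDisableMetaData attaches the loop
    // metadata
    //   !{!"llvm.loop.unroll.runtime.disable"}
    // to the remaining scalar loop so later unroll passes leave it alone.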
10922 
10923     // Mark the loop as already vectorized to avoid vectorizing again.
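    // (Editorial note: this records !{!"llvm.loop.isvectorized", i32 1} in the
    // loop ID, which the vectorizer's hint handling checks on a later run.)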
10924     Hints.setAlreadyVectorized();
10925   }
10926 
10927   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10928   return true;
10929 }
10930 
10931 LoopVectorizeResult LoopVectorizePass::runImpl(Function &F) {
10932 
10933   // Don't attempt if
10934   // 1. the target claims to have no vector registers, and
10935   // 2. interleaving won't help ILP.
10936   //
10937   // The second condition is necessary because, even if the target has no
10938   // vector registers, loop vectorization may still enable scalar
10939   // interleaving.
10940   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
10941       TTI->getMaxInterleaveFactor(ElementCount::getFixed(1)) < 2)
10942     return LoopVectorizeResult(false, false);
10943 
10944   bool Changed = false, CFGChanged = false;
10945 
10946   // The vectorizer requires loops to be in simplified form.
10947   // Since simplification may add new inner loops, it has to run before the
10948   // legality and profitability checks. This means running the loop vectorizer
10949   // will simplify all loops, regardless of whether anything ends up being
10950   // vectorized.
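  // (Editorial note: "simplified form" here is LoopSimplify form: a preheader,
  // a single backedge/latch, and dedicated exit blocks.)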
10951   for (const auto &L : *LI)
10952     Changed |= CFGChanged |=
10953         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10954 
10955   // Build up a worklist of inner-loops to vectorize. This is necessary as
10956   // the act of vectorizing or partially unrolling a loop creates new loops
10957   // and can invalidate iterators across the loops.
10958   SmallVector<Loop *, 8> Worklist;
10959 
10960   for (Loop *L : *LI)
10961     collectSupportedLoops(*L, LI, ORE, Worklist);
10962 
10963   LoopsAnalyzed += Worklist.size();
10964 
10965   // Now walk the identified inner loops.
10966   while (!Worklist.empty()) {
10967     Loop *L = Worklist.pop_back_val();
10968 
10969     // For the inner loops we actually process, form LCSSA to simplify the
10970     // transform.
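    // (Editorial note: LCSSA inserts single-entry PHIs in exit blocks for
    // loop-defined values used outside the loop, e.g.
    //   exit: %x.lcssa = phi i32 [ %x, %loop ]
    // so only those PHIs need updating when the loop is transformed.)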
10971     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
10972 
10973     Changed |= CFGChanged |= processLoop(L);
10974 
10975     if (Changed) {
10976       LAIs->clear();
10977 
10978 #ifndef NDEBUG
10979       if (VerifySCEV)
10980         SE->verify();
10981 #endif
10982     }
10983   }
10984 
10985   // Process each loop nest in the function.
10986   return LoopVectorizeResult(Changed, CFGChanged);
10987 }
10988 
10989 PreservedAnalyses LoopVectorizePass::run(Function &F,
10990                                          FunctionAnalysisManager &AM) {
10991   LI = &AM.getResult<LoopAnalysis>(F);
10992   // If there are no loops in the function, return before computing other
10993   // expensive analyses.
10994   if (LI->empty())
10995     return PreservedAnalyses::all();
10996   SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
10997   TTI = &AM.getResult<TargetIRAnalysis>(F);
10998   DT = &AM.getResult<DominatorTreeAnalysis>(F);
10999   TLI = &AM.getResult<TargetLibraryAnalysis>(F);
11000   AC = &AM.getResult<AssumptionAnalysis>(F);
11001   DB = &AM.getResult<DemandedBitsAnalysis>(F);
11002   ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
11003   LAIs = &AM.getResult<LoopAccessAnalysis>(F);
11004 
11005   auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
11006   PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
11007   BFI = nullptr;
11008   if (PSI && PSI->hasProfileSummary())
11009     BFI = &AM.getResult<BlockFrequencyAnalysis>(F);
11010   LoopVectorizeResult Result = runImpl(F);
11011   if (!Result.MadeAnyChange)
11012     return PreservedAnalyses::all();
11013   PreservedAnalyses PA;
11014 
11015   if (isAssignmentTrackingEnabled(*F.getParent())) {
11016     for (auto &BB : F)
11017       RemoveRedundantDbgInstrs(&BB);
11018   }
11019 
11020   PA.preserve<LoopAnalysis>();
11021   PA.preserve<DominatorTreeAnalysis>();
11022   PA.preserve<ScalarEvolutionAnalysis>();
11023   PA.preserve<LoopAccessAnalysis>();
11024 
11025   if (Result.MadeCFGChange) {
11026     // Making CFG changes likely means a loop got vectorized. Indicate that
11027     // extra simplification passes should be run.
11028     // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
11029     // be run if runtime checks have been added.
11030     AM.getResult<ShouldRunExtraVectorPasses>(F);
11031     PA.preserve<ShouldRunExtraVectorPasses>();
11032   } else {
11033     PA.preserveSet<CFGAnalyses>();
11034   }
11035   return PA;
11036 }
11037 
11038 void LoopVectorizePass::printPipeline(
11039     raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
11040   static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
11041       OS, MapClassName2PassName);
11042 
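  // Editorial note: with the default options this prints something like
  //   loop-vectorize<no-interleave-forced-only;no-vectorize-forced-only>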
11043   OS << '<';
11044   OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
11045   OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
11046   OS << '>';
11047 }
11048