1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
17 //
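// For illustration (a hypothetical C loop, not taken from this file): a loop
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + c[i];
// vectorized with VF = 4 conceptually processes a[i..i+3], b[i..i+3] and
// c[i..i+3] in each wide iteration and increments i by 4, while a scalar
// remainder loop handles the final n % 4 iterations.
//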
18 // This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
28 // There is a development effort going on to migrate the loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanAnalysis.h"
61 #include "VPlanHCFGBuilder.h"
62 #include "VPlanPatternMatch.h"
63 #include "VPlanTransforms.h"
64 #include "VPlanUtils.h"
65 #include "VPlanVerifier.h"
66 #include "llvm/ADT/APInt.h"
67 #include "llvm/ADT/ArrayRef.h"
68 #include "llvm/ADT/DenseMap.h"
69 #include "llvm/ADT/DenseMapInfo.h"
70 #include "llvm/ADT/Hashing.h"
71 #include "llvm/ADT/MapVector.h"
72 #include "llvm/ADT/STLExtras.h"
73 #include "llvm/ADT/SmallPtrSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/TypeSwitch.h"
79 #include "llvm/ADT/iterator_range.h"
80 #include "llvm/Analysis/AssumptionCache.h"
81 #include "llvm/Analysis/BasicAliasAnalysis.h"
82 #include "llvm/Analysis/BlockFrequencyInfo.h"
83 #include "llvm/Analysis/CFG.h"
84 #include "llvm/Analysis/CodeMetrics.h"
85 #include "llvm/Analysis/DemandedBits.h"
86 #include "llvm/Analysis/GlobalsModRef.h"
87 #include "llvm/Analysis/LoopAccessAnalysis.h"
88 #include "llvm/Analysis/LoopAnalysisManager.h"
89 #include "llvm/Analysis/LoopInfo.h"
90 #include "llvm/Analysis/LoopIterator.h"
91 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
92 #include "llvm/Analysis/ProfileSummaryInfo.h"
93 #include "llvm/Analysis/ScalarEvolution.h"
94 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
95 #include "llvm/Analysis/TargetLibraryInfo.h"
96 #include "llvm/Analysis/TargetTransformInfo.h"
97 #include "llvm/Analysis/ValueTracking.h"
98 #include "llvm/Analysis/VectorUtils.h"
99 #include "llvm/IR/Attributes.h"
100 #include "llvm/IR/BasicBlock.h"
101 #include "llvm/IR/CFG.h"
102 #include "llvm/IR/Constant.h"
103 #include "llvm/IR/Constants.h"
104 #include "llvm/IR/DataLayout.h"
105 #include "llvm/IR/DebugInfo.h"
106 #include "llvm/IR/DebugLoc.h"
107 #include "llvm/IR/DerivedTypes.h"
108 #include "llvm/IR/DiagnosticInfo.h"
109 #include "llvm/IR/Dominators.h"
110 #include "llvm/IR/Function.h"
111 #include "llvm/IR/IRBuilder.h"
112 #include "llvm/IR/InstrTypes.h"
113 #include "llvm/IR/Instruction.h"
114 #include "llvm/IR/Instructions.h"
115 #include "llvm/IR/IntrinsicInst.h"
116 #include "llvm/IR/Intrinsics.h"
117 #include "llvm/IR/MDBuilder.h"
118 #include "llvm/IR/Metadata.h"
119 #include "llvm/IR/Module.h"
120 #include "llvm/IR/Operator.h"
121 #include "llvm/IR/PatternMatch.h"
122 #include "llvm/IR/ProfDataUtils.h"
123 #include "llvm/IR/Type.h"
124 #include "llvm/IR/Use.h"
125 #include "llvm/IR/User.h"
126 #include "llvm/IR/Value.h"
127 #include "llvm/IR/Verifier.h"
128 #include "llvm/Support/Casting.h"
129 #include "llvm/Support/CommandLine.h"
130 #include "llvm/Support/Debug.h"
131 #include "llvm/Support/ErrorHandling.h"
132 #include "llvm/Support/InstructionCost.h"
133 #include "llvm/Support/MathExtras.h"
134 #include "llvm/Support/NativeFormatting.h"
135 #include "llvm/Support/raw_ostream.h"
136 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
137 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
138 #include "llvm/Transforms/Utils/Local.h"
139 #include "llvm/Transforms/Utils/LoopSimplify.h"
140 #include "llvm/Transforms/Utils/LoopUtils.h"
141 #include "llvm/Transforms/Utils/LoopVersioning.h"
142 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
143 #include "llvm/Transforms/Utils/SizeOpts.h"
144 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
145 #include <algorithm>
146 #include <cassert>
147 #include <cstdint>
148 #include <functional>
149 #include <iterator>
150 #include <limits>
151 #include <memory>
152 #include <string>
153 #include <tuple>
154 #include <utility>
155 
156 using namespace llvm;
157 
158 #define LV_NAME "loop-vectorize"
159 #define DEBUG_TYPE LV_NAME
160 
161 #ifndef NDEBUG
162 const char VerboseDebug[] = DEBUG_TYPE "-verbose";
163 #endif
164 
165 /// @{
166 /// Metadata attribute names
167 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
168 const char LLVMLoopVectorizeFollowupVectorized[] =
169     "llvm.loop.vectorize.followup_vectorized";
170 const char LLVMLoopVectorizeFollowupEpilogue[] =
171     "llvm.loop.vectorize.followup_epilogue";
172 /// @}
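// These names are looked up in a loop's !llvm.loop metadata; for example, the
// properties listed under llvm.loop.vectorize.followup_vectorized become the
// loop metadata of the generated vector loop (see llvm/docs/TransformMetadata.rst).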
173 
174 STATISTIC(LoopsVectorized, "Number of loops vectorized");
175 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
176 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
177 
178 static cl::opt<bool> EnableEpilogueVectorization(
179     "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
180     cl::desc("Enable vectorization of epilogue loops."));
181 
182 static cl::opt<unsigned> EpilogueVectorizationForceVF(
183     "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
184     cl::desc("When epilogue vectorization is enabled, and a value greater than "
185              "1 is specified, forces the given VF for all applicable epilogue "
186              "loops."));
187 
188 static cl::opt<unsigned> EpilogueVectorizationMinVF(
189     "epilogue-vectorization-minimum-VF", cl::Hidden,
190     cl::desc("Only loops with vectorization factor equal to or larger than "
191              "the specified value are considered for epilogue vectorization."));
192 
193 /// Loops with a known constant trip count below this number are vectorized only
194 /// if no scalar iteration overheads are incurred.
195 static cl::opt<unsigned> TinyTripCountVectorThreshold(
196     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
197     cl::desc("Loops with a constant trip count that is smaller than this "
198              "value are vectorized only if no scalar iteration overheads "
199              "are incurred."));
200 
201 static cl::opt<unsigned> VectorizeMemoryCheckThreshold(
202     "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
203     cl::desc("The maximum allowed number of runtime memory checks"));
204 
205 // Option prefer-predicate-over-epilogue indicates that an epilogue is
206 // undesired and that predication is preferred instead. I.e., the vectorizer
207 // will try to fold the tail loop (epilogue) into the vector body and predicate
208 // the instructions accordingly. If tail folding fails, the values below select
209 // different fallback strategies:
210 namespace PreferPredicateTy {
211   enum Option {
212     ScalarEpilogue = 0,
213     PredicateElseScalarEpilogue,
214     PredicateOrDontVectorize
215   };
216 } // namespace PreferPredicateTy
217 
218 static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
219     "prefer-predicate-over-epilogue",
220     cl::init(PreferPredicateTy::ScalarEpilogue),
221     cl::Hidden,
222     cl::desc("Tail-folding and predication preferences over creating a scalar "
223              "epilogue loop."),
224     cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
225                          "scalar-epilogue",
226                          "Don't tail-predicate loops, create scalar epilogue"),
227               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
228                          "predicate-else-scalar-epilogue",
229                          "prefer tail-folding, create scalar epilogue if tail "
230                          "folding fails."),
231               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
232                          "predicate-dont-vectorize",
233                          "prefer tail-folding, don't attempt vectorization if "
234                          "tail-folding fails.")));
235 
236 static cl::opt<TailFoldingStyle> ForceTailFoldingStyle(
237     "force-tail-folding-style", cl::desc("Force the tail folding style"),
238     cl::init(TailFoldingStyle::None),
239     cl::values(
240         clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"),
241         clEnumValN(
242             TailFoldingStyle::Data, "data",
243             "Create lane mask for data only, using active.lane.mask intrinsic"),
244         clEnumValN(TailFoldingStyle::DataWithoutLaneMask,
245                    "data-without-lane-mask",
246                    "Create lane mask with compare/stepvector"),
247         clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control",
248                    "Create lane mask using active.lane.mask intrinsic, and use "
249                    "it for both data and control flow"),
250         clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
251                    "data-and-control-without-rt-check",
252                    "Similar to data-and-control, but remove the runtime check"),
253         clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl",
254                    "Use predicated EVL instructions for tail folding. If EVL "
255                    "is unsupported, fallback to data-without-lane-mask.")));
256 
257 static cl::opt<bool> MaximizeBandwidth(
258     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
259     cl::desc("Maximize bandwidth when selecting vectorization factor which "
260              "will be determined by the smallest type in loop."));
261 
262 static cl::opt<bool> EnableInterleavedMemAccesses(
263     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
264     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
265 
266 /// An interleave-group may need masking if it resides in a block that needs
267 /// predication, or in order to mask away gaps.
268 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
269     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
270     cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
271 
272 static cl::opt<unsigned> ForceTargetNumScalarRegs(
273     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
274     cl::desc("A flag that overrides the target's number of scalar registers."));
275 
276 static cl::opt<unsigned> ForceTargetNumVectorRegs(
277     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
278     cl::desc("A flag that overrides the target's number of vector registers."));
279 
280 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
281     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
282     cl::desc("A flag that overrides the target's max interleave factor for "
283              "scalar loops."));
284 
285 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
286     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
287     cl::desc("A flag that overrides the target's max interleave factor for "
288              "vectorized loops."));
289 
290 cl::opt<unsigned> ForceTargetInstructionCost(
291     "force-target-instruction-cost", cl::init(0), cl::Hidden,
292     cl::desc("A flag that overrides the target's expected cost for "
293              "an instruction to a single constant value. Mostly "
294              "useful for getting consistent testing."));
295 
296 static cl::opt<bool> ForceTargetSupportsScalableVectors(
297     "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
298     cl::desc(
299         "Pretend that scalable vectors are supported, even if the target does "
300         "not support them. This flag should only be used for testing."));
301 
302 static cl::opt<unsigned> SmallLoopCost(
303     "small-loop-cost", cl::init(20), cl::Hidden,
304     cl::desc(
305         "The cost of a loop that is considered 'small' by the interleaver."));
306 
307 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
308     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
309     cl::desc("Enable the use of the block frequency analysis to access PGO "
310              "heuristics minimizing code growth in cold regions and being more "
311              "aggressive in hot regions."));
312 
313 // Runtime interleave loops for load/store throughput.
314 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
315     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
316     cl::desc(
317         "Enable runtime interleaving until load/store ports are saturated"));
318 
319 /// The number of stores in a loop that are allowed to need predication.
320 static cl::opt<unsigned> NumberOfStoresToPredicate(
321     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
322     cl::desc("Max number of stores to be predicated behind an if."));
323 
324 static cl::opt<bool> EnableIndVarRegisterHeur(
325     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
326     cl::desc("Count the induction variable only once when interleaving"));
327 
328 static cl::opt<bool> EnableCondStoresVectorization(
329     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
330     cl::desc("Enable if predication of stores during vectorization."));
331 
332 static cl::opt<unsigned> MaxNestedScalarReductionIC(
333     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
334     cl::desc("The maximum interleave count to use when interleaving a scalar "
335              "reduction in a nested loop."));
336 
337 static cl::opt<bool>
338     PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
339                            cl::Hidden,
340                            cl::desc("Prefer in-loop vector reductions, "
341                                     "overriding the target's preference."));
342 
343 static cl::opt<bool> ForceOrderedReductions(
344     "force-ordered-reductions", cl::init(false), cl::Hidden,
345     cl::desc("Enable the vectorisation of loops with in-order (strict) "
346              "FP reductions"));
347 
348 static cl::opt<bool> PreferPredicatedReductionSelect(
349     "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
350     cl::desc(
351         "Prefer predicating a reduction operation over an after loop select."));
352 
353 namespace llvm {
354 cl::opt<bool> EnableVPlanNativePath(
355     "enable-vplan-native-path", cl::Hidden,
356     cl::desc("Enable VPlan-native vectorization path with "
357              "support for outer loop vectorization."));
358 } // namespace llvm
359 
360 // This flag enables the stress testing of the VPlan H-CFG construction in the
361 // VPlan-native vectorization path. It must be used in conjunction with
362 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
363 // verification of the H-CFGs built.
364 static cl::opt<bool> VPlanBuildStressTest(
365     "vplan-build-stress-test", cl::init(false), cl::Hidden,
366     cl::desc(
367         "Build VPlan for every supported loop nest in the function and bail "
368         "out right after the build (stress test the VPlan H-CFG construction "
369         "in the VPlan-native vectorization path)."));
370 
371 cl::opt<bool> llvm::EnableLoopInterleaving(
372     "interleave-loops", cl::init(true), cl::Hidden,
373     cl::desc("Enable loop interleaving in Loop vectorization passes"));
374 cl::opt<bool> llvm::EnableLoopVectorization(
375     "vectorize-loops", cl::init(true), cl::Hidden,
376     cl::desc("Run the Loop vectorization passes"));
377 
378 static cl::opt<cl::boolOrDefault> ForceSafeDivisor(
379     "force-widen-divrem-via-safe-divisor", cl::Hidden,
380     cl::desc(
381         "Override cost based safe divisor widening for div/rem instructions"));
382 
383 static cl::opt<bool> UseWiderVFIfCallVariantsPresent(
384     "vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true),
385     cl::Hidden,
386     cl::desc("Try wider VFs if they enable the use of vector variants"));
387 
388 static cl::opt<bool> EnableEarlyExitVectorization(
389     "enable-early-exit-vectorization", cl::init(false), cl::Hidden,
390     cl::desc(
391         "Enable vectorization of early exit loops with uncountable exits."));
392 
393 // Likelihood of bypassing the vectorized loop because assumptions about SCEV
394 // variables not overflowing do not hold. See `emitSCEVChecks`.
395 static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127};
396 // Likelihood of bypassing the vectorized loop because pointers overlap. See
397 // `emitMemRuntimeChecks`.
398 static constexpr uint32_t MemCheckBypassWeights[] = {1, 127};
399 // Likelihood of bypassing the vectorized loop because there are zero trips left
400 // after prolog. See `emitIterationCountCheck`.
401 static constexpr uint32_t MinItersBypassWeights[] = {1, 127};
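// In all three cases the weights {1, 127} mark the bypass edge as taken
// roughly once every 128 executions, i.e. the vector loop is assumed to be the
// common path.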
402 
403 /// A helper function that returns true if the given type is irregular. The
404 /// type is irregular if its allocated size doesn't equal the store size of an
405 /// element of the corresponding vector type.
406 static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
407   // Determine if an array of N elements of type Ty is "bitcast compatible"
408   // with a <N x Ty> vector.
409   // This is only true if there is no padding between the array elements.
410   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
411 }
412 
413 /// Returns "best known" trip count for the specified loop \p L as defined by
414 /// the following procedure:
415 ///   1) Returns exact trip count if it is known.
416 ///   2) Returns expected trip count according to profile data if any.
417 ///   3) Returns upper bound estimate if known, and if \p CanUseConstantMax.
418 ///   4) Returns std::nullopt if all of the above failed.
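/// A minimal usage sketch (hypothetical, assuming PSE and L are in scope):
///   if (std::optional<unsigned> TC = getSmallBestKnownTC(PSE, L))
///     LLVM_DEBUG(dbgs() << "LV: best known trip count: " << *TC << '\n');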
419 static std::optional<unsigned>
420 getSmallBestKnownTC(PredicatedScalarEvolution &PSE, Loop *L,
421                     bool CanUseConstantMax = true) {
422   // Check if exact trip count is known.
423   if (unsigned ExpectedTC = PSE.getSE()->getSmallConstantTripCount(L))
424     return ExpectedTC;
425 
426   // Check if there is an expected trip count available from profile data.
427   if (LoopVectorizeWithBlockFrequency)
428     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
429       return *EstimatedTC;
430 
431   if (!CanUseConstantMax)
432     return std::nullopt;
433 
434   // Check if upper bound estimate is known.
435   if (unsigned ExpectedTC = PSE.getSmallConstantMaxTripCount())
436     return ExpectedTC;
437 
438   return std::nullopt;
439 }
440 
441 namespace {
442 // Forward declare GeneratedRTChecks.
443 class GeneratedRTChecks;
444 
445 using SCEV2ValueTy = DenseMap<const SCEV *, Value *>;
446 } // namespace
447 
448 namespace llvm {
449 
450 AnalysisKey ShouldRunExtraVectorPasses::Key;
451 
452 /// InnerLoopVectorizer vectorizes loops which contain only one basic
453 /// block to a specified vectorization factor (VF).
454 /// This class performs the widening of scalars into vectors, or multiple
455 /// scalars. This class also implements the following features:
456 /// * It inserts an epilogue loop for handling loops that don't have iteration
457 ///   counts that are known to be a multiple of the vectorization factor.
458 /// * It handles the code generation for reduction variables.
459 /// * Scalarization (implementation using scalars) of un-vectorizable
460 ///   instructions.
461 /// InnerLoopVectorizer does not perform any vectorization-legality
462 /// checks, and relies on the caller to check for the different legality
463 /// aspects. The InnerLoopVectorizer relies on the
464 /// LoopVectorizationLegality class to provide information about the induction
465 /// and reduction variables that were found in the loop.
466 class InnerLoopVectorizer {
467 public:
468   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
469                       LoopInfo *LI, DominatorTree *DT,
470                       const TargetLibraryInfo *TLI,
471                       const TargetTransformInfo *TTI, AssumptionCache *AC,
472                       OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
473                       ElementCount MinProfitableTripCount,
474                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
475                       LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
476                       ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks,
477                       VPlan &Plan)
478       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
479         AC(AC), ORE(ORE), VF(VecWidth),
480         MinProfitableTripCount(MinProfitableTripCount), UF(UnrollFactor),
481         Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
482         PSI(PSI), RTChecks(RTChecks), Plan(Plan),
483         VectorPHVPB(Plan.getEntry()->getSingleSuccessor()) {
484     // Query this against the original loop and save it here because the profile
485     // of the original loop header may change as the transformation happens.
486     OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
487         OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
488   }
489 
490   virtual ~InnerLoopVectorizer() = default;
491 
492   /// Create a new empty loop that will contain vectorized instructions later
493   /// on, while the old loop will be used as the scalar remainder. Control flow
494   /// is generated around the vectorized (and scalar epilogue) loops consisting
495   /// of various checks and bypasses. Return the pre-header block of the new
496 /// loop. In the case of epilogue vectorization, this function is overridden to
497   /// handle the more complex control flow around the loops. \p ExpandedSCEVs is
498   /// used to look up SCEV expansions for expressions needed during skeleton
499   /// creation.
500   virtual BasicBlock *
501   createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs);
502 
503   /// Fix the vectorized code, taking care of header phi's, and more.
504   void fixVectorizedLoop(VPTransformState &State);
505 
506   // Return true if any runtime check is added.
507   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
508 
509   /// A helper function to scalarize a single Instruction in the innermost loop.
510   /// Generates a scalar instance of \p Instr for the given lane \p Lane, using
511   /// the VPValue operands from \p RepRecipe instead of \p Instr's operands.
514   void scalarizeInstruction(const Instruction *Instr,
515                             VPReplicateRecipe *RepRecipe, const VPLane &Lane,
516                             VPTransformState &State);
517 
518   /// Fix the non-induction PHIs in \p Plan.
519   void fixNonInductionPHIs(VPTransformState &State);
520 
521   /// Returns the original loop trip count.
522   Value *getTripCount() const { return TripCount; }
523 
524   /// Used to set the trip count after ILV's construction and after the
525   /// preheader block has been executed. Note that this always holds the trip
526   /// count of the original loop for both main loop and epilogue vectorization.
527   void setTripCount(Value *TC) { TripCount = TC; }
528 
529   /// Retrieve the additional bypass value associated with an original
530   /// induction header phi.
531   Value *getInductionAdditionalBypassValue(PHINode *OrigPhi) const {
532     return Induction2AdditionalBypassValue.at(OrigPhi);
533   }
534 
535   /// Return the additional bypass block which targets the scalar loop by
536   /// skipping the epilogue loop after completing the main loop.
537   BasicBlock *getAdditionalBypassBlock() const {
538     assert(AdditionalBypassBlock &&
539            "Trying to access AdditionalBypassBlock but it has not been set");
540     return AdditionalBypassBlock;
541   }
542 
543 protected:
544   friend class LoopVectorizationPlanner;
545 
546   /// Set up the values of the IVs correctly when exiting the vector loop.
547   virtual void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
548                             Value *VectorTripCount, BasicBlock *MiddleBlock,
549                             VPTransformState &State);
550 
551   /// Iteratively sink the scalarized operands of a predicated instruction into
552   /// the block that was created for it.
553   void sinkScalarOperands(Instruction *PredInst);
554 
555   /// Returns (and creates if needed) the trip count of the widened loop.
556   Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);
557 
558   /// Emit a bypass check to see if the vector trip count is zero, including if
559   /// it overflows.
560   void emitIterationCountCheck(BasicBlock *Bypass);
561 
562   /// Emit a bypass check to see if all of the SCEV assumptions we've
563   /// had to make are correct. Returns the block containing the checks or
564   /// nullptr if no checks have been added.
565   BasicBlock *emitSCEVChecks(BasicBlock *Bypass);
566 
567   /// Emit bypass checks to check any memory assumptions we may have made.
568   /// Returns the block containing the checks or nullptr if no checks have been
569   /// added.
570   BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass);
571 
572   /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
573   /// vector loop preheader, middle block and scalar preheader.
574   void createVectorLoopSkeleton(StringRef Prefix);
575 
576   /// Create and record the resume values for induction variables coming in
577   /// from the additional bypass block.
578   void createInductionAdditionalBypassValues(const SCEV2ValueTy &ExpandedSCEVs,
579                                              Value *MainVectorTripCount);
580 
581   /// Allow subclasses to override and print debug traces before/after vplan
582   /// execution, when trace information is requested.
583   virtual void printDebugTracesAtStart() {}
584   virtual void printDebugTracesAtEnd() {}
585 
586   /// Introduces a new VPIRBasicBlock for \p CheckIRBB to Plan between the
587   /// vector preheader and its predecessor, also connecting the new block to the
588   /// scalar preheader.
589   void introduceCheckBlockInVPlan(BasicBlock *CheckIRBB);
590 
591   /// The original loop.
592   Loop *OrigLoop;
593 
594   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
595   /// dynamic knowledge to simplify SCEV expressions and converts them to a
596   /// more usable form.
597   PredicatedScalarEvolution &PSE;
598 
599   /// Loop Info.
600   LoopInfo *LI;
601 
602   /// Dominator Tree.
603   DominatorTree *DT;
604 
605   /// Target Library Info.
606   const TargetLibraryInfo *TLI;
607 
608   /// Target Transform Info.
609   const TargetTransformInfo *TTI;
610 
611   /// Assumption Cache.
612   AssumptionCache *AC;
613 
614   /// Interface to emit optimization remarks.
615   OptimizationRemarkEmitter *ORE;
616 
617   /// The vectorization SIMD factor to use. Each vector will have this many
618   /// vector elements.
619   ElementCount VF;
620 
621   ElementCount MinProfitableTripCount;
622 
623   /// The vectorization unroll factor to use. Each scalar is vectorized to this
624   /// many different vector instructions.
625   unsigned UF;
626 
627   /// The builder that we use
628   IRBuilder<> Builder;
629 
630   // --- Vectorization state ---
631 
632   /// The vector-loop preheader.
633   BasicBlock *LoopVectorPreHeader;
634 
635   /// The scalar-loop preheader.
636   BasicBlock *LoopScalarPreHeader;
637 
638   /// Middle Block between the vector and the scalar.
639   BasicBlock *LoopMiddleBlock;
640 
641   /// A list of all bypass blocks. The first block is the entry of the loop.
642   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
643 
644   /// Store instructions that were predicated.
645   SmallVector<Instruction *, 4> PredicatedInstructions;
646 
647   /// Trip count of the original loop.
648   Value *TripCount = nullptr;
649 
650   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
651   Value *VectorTripCount = nullptr;
652 
653   /// The legality analysis.
654   LoopVectorizationLegality *Legal;
655 
656   /// The profitability analysis.
657   LoopVectorizationCostModel *Cost;
658 
659   // Record whether runtime checks are added.
660   bool AddedSafetyChecks = false;
661 
662   /// BFI and PSI are used to check for profile guided size optimizations.
663   BlockFrequencyInfo *BFI;
664   ProfileSummaryInfo *PSI;
665 
666   // Whether this loop should be optimized for size based on profile-guided
667   // size optimizations.
668   bool OptForSizeBasedOnProfile;
669 
670   /// Structure to hold information about generated runtime checks, responsible
671   /// for cleaning the checks, if vectorization turns out unprofitable.
672   GeneratedRTChecks &RTChecks;
673 
674   /// Mapping of induction phis to their additional bypass values. They
675   /// need to be added as operands to phi nodes in the scalar loop preheader
676   /// after the epilogue skeleton has been created.
677   DenseMap<PHINode *, Value *> Induction2AdditionalBypassValue;
678 
679   /// The additional bypass block which conditionally skips over the epilogue
680   /// loop after executing the main loop. Needed to resume inductions and
681   /// reductions during epilogue vectorization.
682   BasicBlock *AdditionalBypassBlock = nullptr;
683 
684   VPlan &Plan;
685 
686   /// The vector preheader block of \p Plan, used as target for check blocks
687   /// introduced during skeleton creation.
688   VPBlockBase *VectorPHVPB;
689 };
690 
691 /// Encapsulate information regarding vectorization of a loop and its epilogue.
692 /// This information is meant to be updated and used across two stages of
693 /// epilogue vectorization.
694 struct EpilogueLoopVectorizationInfo {
695   ElementCount MainLoopVF = ElementCount::getFixed(0);
696   unsigned MainLoopUF = 0;
697   ElementCount EpilogueVF = ElementCount::getFixed(0);
698   unsigned EpilogueUF = 0;
699   BasicBlock *MainLoopIterationCountCheck = nullptr;
700   BasicBlock *EpilogueIterationCountCheck = nullptr;
701   BasicBlock *SCEVSafetyCheck = nullptr;
702   BasicBlock *MemSafetyCheck = nullptr;
703   Value *TripCount = nullptr;
704   Value *VectorTripCount = nullptr;
705   VPlan &EpiloguePlan;
706 
707   EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
708                                 ElementCount EVF, unsigned EUF,
709                                 VPlan &EpiloguePlan)
710       : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF),
711         EpiloguePlan(EpiloguePlan) {
712     assert(EUF == 1 &&
713            "A high UF for the epilogue loop is likely not beneficial.");
714   }
715 };
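// A hypothetical instantiation (assuming a VPlan named EpiloguePlan is in
// scope), pairing an 8-wide, UF=2 main loop with a 4-wide epilogue; EUF must
// be 1, per the assertion in the constructor above:
//   EpilogueLoopVectorizationInfo EPI(ElementCount::getFixed(8), /*MUF=*/2,
//                                     ElementCount::getFixed(4), /*EUF=*/1,
//                                     EpiloguePlan);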
716 
717 /// An extension of the inner loop vectorizer that creates a skeleton for a
718 /// vectorized loop that has its epilogue (residual) also vectorized.
719 /// The idea is to run the vplan on a given loop twice, firstly to set up the
720 /// skeleton and vectorize the main loop, and secondly to complete the skeleton
721 /// from the first step and vectorize the epilogue. This is achieved by
722 /// deriving two concrete strategy classes from this base class and invoking
723 /// them in succession from the loop vectorizer planner.
724 class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
725 public:
726   InnerLoopAndEpilogueVectorizer(
727       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
728       DominatorTree *DT, const TargetLibraryInfo *TLI,
729       const TargetTransformInfo *TTI, AssumptionCache *AC,
730       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
731       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
732       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
733       GeneratedRTChecks &Checks, VPlan &Plan)
734       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
735                             EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
736                             CM, BFI, PSI, Checks, Plan),
737         EPI(EPI) {}
738 
739   // Override this function to handle the more complex control flow around the
740   // three loops.
741   BasicBlock *
742   createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final {
743     return createEpilogueVectorizedLoopSkeleton(ExpandedSCEVs);
744   }
745 
746   /// The interface for creating a vectorized skeleton using one of two
747   /// different strategies, each corresponding to one execution of the vplan
748   /// as described above.
749   virtual BasicBlock *
750   createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) = 0;
751 
752   /// Holds and updates state information required to vectorize the main loop
753   /// and its epilogue in two separate passes. This setup helps us avoid
754   /// regenerating and recomputing runtime safety checks. It also helps us to
755   /// shorten the iteration-count-check path length for the cases where the
756   /// iteration count of the loop is so small that the main vector loop is
757   /// completely skipped.
758   EpilogueLoopVectorizationInfo &EPI;
759 };
760 
761 /// A specialized derived class of inner loop vectorizer that performs
762 /// vectorization of *main* loops in the process of vectorizing loops and their
763 /// epilogues.
764 class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
765 public:
766   EpilogueVectorizerMainLoop(
767       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
768       DominatorTree *DT, const TargetLibraryInfo *TLI,
769       const TargetTransformInfo *TTI, AssumptionCache *AC,
770       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
771       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
772       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
773       GeneratedRTChecks &Check, VPlan &Plan)
774       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
775                                        EPI, LVL, CM, BFI, PSI, Check, Plan) {}
776   /// Implements the interface for creating a vectorized skeleton using the
777   /// *main loop* strategy (ie the first pass of vplan execution).
778   BasicBlock *
779   createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
780 
781 protected:
782   /// Emits an iteration count bypass check once for the main loop (when \p
783   /// ForEpilogue is false) and once for the epilogue loop (when \p
784   /// ForEpilogue is true).
785   BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
786   void printDebugTracesAtStart() override;
787   void printDebugTracesAtEnd() override;
788 
789   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
790                     Value *VectorTripCount, BasicBlock *MiddleBlock,
791                     VPTransformState &State) override {};
792 };
793 
794 // A specialized derived class of inner loop vectorizer that performs
795 // vectorization of *epilogue* loops in the process of vectorizing loops and
796 // their epilogues.
797 class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
798 public:
799   EpilogueVectorizerEpilogueLoop(
800       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
801       DominatorTree *DT, const TargetLibraryInfo *TLI,
802       const TargetTransformInfo *TTI, AssumptionCache *AC,
803       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
804       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
805       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
806       GeneratedRTChecks &Checks, VPlan &Plan)
807       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
808                                        EPI, LVL, CM, BFI, PSI, Checks, Plan) {
809     TripCount = EPI.TripCount;
810   }
811   /// Implements the interface for creating a vectorized skeleton using the
812   /// *epilogue loop* strategy (ie the second pass of vplan execution).
813   BasicBlock *
814   createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
815 
816 protected:
817   /// Emits an iteration count bypass check after the main vector loop has
818   /// finished to see if there are any iterations left to execute by either
819   /// the vector epilogue or the scalar epilogue.
820   BasicBlock *emitMinimumVectorEpilogueIterCountCheck(BasicBlock *Bypass,
821                                                       BasicBlock *Insert);
823   void printDebugTracesAtStart() override;
824   void printDebugTracesAtEnd() override;
825 };
826 } // end namespace llvm
827 
828 /// Look for a meaningful debug location on the instruction or its operands.
829 static DebugLoc getDebugLocFromInstOrOperands(Instruction *I) {
830   if (!I)
831     return DebugLoc();
832 
833   DebugLoc Empty;
834   if (I->getDebugLoc() != Empty)
835     return I->getDebugLoc();
836 
837   for (Use &Op : I->operands()) {
838     if (Instruction *OpInst = dyn_cast<Instruction>(Op))
839       if (OpInst->getDebugLoc() != Empty)
840         return OpInst->getDebugLoc();
841   }
842 
843   return I->getDebugLoc();
844 }
845 
846 /// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
847 /// is passed, the message relates to that particular instruction.
848 #ifndef NDEBUG
849 static void debugVectorizationMessage(const StringRef Prefix,
850                                       const StringRef DebugMsg,
851                                       Instruction *I) {
852   dbgs() << "LV: " << Prefix << DebugMsg;
853   if (I != nullptr)
854     dbgs() << " " << *I;
855   else
856     dbgs() << '.';
857   dbgs() << '\n';
858 }
859 #endif
860 
861 /// Create an analysis remark that explains why vectorization failed
862 ///
863 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
864 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
865 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
866 /// the location of the remark. If \p DL is passed, use it as debug location for
867 /// the remark. \return the remark object that can be streamed to.
868 static OptimizationRemarkAnalysis
869 createLVAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop,
870                  Instruction *I, DebugLoc DL = {}) {
871   Value *CodeRegion = I ? I->getParent() : TheLoop->getHeader();
872   // If debug location is attached to the instruction, use it. Otherwise if DL
873   // was not provided, use the loop's.
874   if (I && I->getDebugLoc())
875     DL = I->getDebugLoc();
876   else if (!DL)
877     DL = TheLoop->getStartLoc();
878 
879   return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
880 }
881 
882 namespace llvm {
883 
884 /// Return a value for Step multiplied by VF.
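/// For example, with Step = 2 this yields the constant 8 for a fixed VF of 4,
/// and (conceptually) vscale * 8 for a scalable VF of <vscale x 4>.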
885 Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
886                        int64_t Step) {
887   assert(Ty->isIntegerTy() && "Expected an integer step");
888   return B.CreateElementCount(Ty, VF.multiplyCoefficientBy(Step));
889 }
890 
891 /// Return the runtime value for VF.
892 Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
893   return B.CreateElementCount(Ty, VF);
894 }
895 
896 void reportVectorizationFailure(const StringRef DebugMsg,
897                                 const StringRef OREMsg, const StringRef ORETag,
898                                 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
899                                 Instruction *I) {
900   LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
901   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
902   ORE->emit(
903       createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
904       << "loop not vectorized: " << OREMsg);
905 }
906 
907 /// Reports an informative message: print \p Msg for debugging purposes as well
908 /// as an optimization remark. Uses either \p I as location of the remark, or
909 /// otherwise \p TheLoop. If \p DL is passed, use it as debug location for the
910 /// remark.
911 static void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
912                                     OptimizationRemarkEmitter *ORE,
913                                     Loop *TheLoop, Instruction *I = nullptr,
914                                     DebugLoc DL = {}) {
915   LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
916   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
917   ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop,
918                              I, DL)
919             << Msg);
920 }
921 
922 /// Report successful vectorization of the loop. In case an outer loop is
923 /// vectorized, prepend "outer" to the vectorization remark.
924 static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop,
925                                 VectorizationFactor VF, unsigned IC) {
926   LLVM_DEBUG(debugVectorizationMessage(
927       "Vectorizing: ", TheLoop->isInnermost() ? "innermost loop" : "outer loop",
928       nullptr));
929   StringRef LoopType = TheLoop->isInnermost() ? "" : "outer ";
930   ORE->emit([&]() {
931     return OptimizationRemark(LV_NAME, "Vectorized", TheLoop->getStartLoc(),
932                               TheLoop->getHeader())
933            << "vectorized " << LoopType << "loop (vectorization width: "
934            << ore::NV("VectorizationFactor", VF.Width)
935            << ", interleaved count: " << ore::NV("InterleaveCount", IC) << ")";
936   });
937 }
938 
939 } // end namespace llvm
940 
941 namespace llvm {
942 
943 // Loop vectorization cost-model hints how the scalar epilogue loop should be
944 // lowered.
945 enum ScalarEpilogueLowering {
946 
947   // The default: allowing scalar epilogues.
948   CM_ScalarEpilogueAllowed,
949 
950   // Vectorization with OptForSize: don't allow epilogues.
951   CM_ScalarEpilogueNotAllowedOptSize,
952 
953   // A special case of vectorization with OptForSize: loops with a very small
954   // trip count are considered for vectorization under OptForSize, thereby
955   // making sure the cost of their loop body is dominant and free of runtime
956   // guards and scalar iteration overheads.
957   CM_ScalarEpilogueNotAllowedLowTripLoop,
958 
959   // Loop hint predicate indicating an epilogue is undesired.
960   CM_ScalarEpilogueNotNeededUsePredicate,
961 
962   // Directive indicating we must either tail fold or not vectorize
963   CM_ScalarEpilogueNotAllowedUsePredicate
964 };
965 
966 using InstructionVFPair = std::pair<Instruction *, ElementCount>;
967 
968 /// LoopVectorizationCostModel - estimates the expected speedups due to
969 /// vectorization.
970 /// In many cases vectorization is not profitable. This can happen because of
971 /// a number of reasons. In this class we mainly attempt to predict the
972 /// expected speedup/slowdowns due to the supported instruction set. We use the
973 /// TargetTransformInfo to query the different backends for the cost of
974 /// different operations.
975 class LoopVectorizationCostModel {
976   friend class LoopVectorizationPlanner;
977 
978 public:
979   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
980                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
981                              LoopVectorizationLegality *Legal,
982                              const TargetTransformInfo &TTI,
983                              const TargetLibraryInfo *TLI, DemandedBits *DB,
984                              AssumptionCache *AC,
985                              OptimizationRemarkEmitter *ORE, const Function *F,
986                              const LoopVectorizeHints *Hints,
987                              InterleavedAccessInfo &IAI)
988       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
989         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
990         Hints(Hints), InterleaveInfo(IAI) {}
991 
992   /// \return An upper bound for the vectorization factors (both fixed and
993   /// scalable). If the factors are 0, vectorization and interleaving should be
994   /// avoided up front.
995   FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
996 
997   /// \return True if runtime checks are required for vectorization, and false
998   /// otherwise.
999   bool runtimeChecksRequired();
1000 
1001   /// Setup cost-based decisions for user vectorization factor.
1002   /// \return true if the UserVF is a feasible VF to be chosen.
1003   bool selectUserVectorizationFactor(ElementCount UserVF) {
1004     collectUniformsAndScalars(UserVF);
1005     collectInstsToScalarize(UserVF);
1006     return expectedCost(UserVF).isValid();
1007   }
1008 
1009   /// \return The size (in bits) of the smallest and widest types in the code
1010   /// that needs to be vectorized. We ignore values that remain scalar such as
1011   /// 64 bit loop indices.
1012   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1013 
1014   /// \return The desired interleave count.
1015   /// If interleave count has been specified by metadata it will be returned.
1016   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1017   /// are the selected vectorization factor and the cost of the selected VF.
1018   unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost);
1019 
1020   /// Memory access instruction may be vectorized in more than one way.
1021   /// Form of instruction after vectorization depends on cost.
1022   /// This function takes cost-based decisions for Load/Store instructions
1023   /// and collects them in a map. This decisions map is used for building
1024   /// the lists of loop-uniform and loop-scalar instructions.
1025   /// The calculated cost is saved with widening decision in order to
1026   /// avoid redundant calculations.
1027   void setCostBasedWideningDecision(ElementCount VF);
1028 
1029   /// A call may be vectorized in different ways depending on whether we have
1030   /// vectorized variants available and whether the target supports masking.
1031   /// This function analyzes all calls in the function at the supplied VF,
1032   /// makes a decision based on the costs of available options, and stores that
1033   /// decision in a map for use in planning and plan execution.
1034   void setVectorizedCallDecision(ElementCount VF);
1035 
1036   /// A struct that represents some properties of the register usage
1037   /// of a loop.
1038   struct RegisterUsage {
1039     /// Holds the number of loop invariant values that are used in the loop.
1040     /// The key is ClassID of target-provided register class.
1041     SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1042     /// Holds the maximum number of concurrent live intervals in the loop.
1043     /// The key is ClassID of target-provided register class.
1044     SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1045   };
1046 
1047   /// \return Returns information about the register usages of the loop for the
1048   /// given vectorization factors.
1049   SmallVector<RegisterUsage, 8>
1050   calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1051 
1052   /// Collect values we want to ignore in the cost model.
1053   void collectValuesToIgnore();
1054 
1055   /// Collect all element types in the loop for which widening is needed.
1056   void collectElementTypesForWidening();
1057 
1058   /// Split reductions into those that happen in the loop, and those that happen
1059   /// outside. In-loop reductions are collected into InLoopReductions.
1060   void collectInLoopReductions();
1061 
1062   /// Returns true if we should use strict in-order reductions for the given
1063   /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
1064   /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
1065   /// of FP operations.
1066   bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
1067     return !Hints->allowReordering() && RdxDesc.isOrdered();
1068   }
1069 
1070   /// \returns The smallest bitwidth each instruction can be represented with.
1071   /// The vector equivalents of these instructions should be truncated to this
1072   /// type.
1073   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1074     return MinBWs;
1075   }
1076 
1077   /// \returns True if it is more profitable to scalarize instruction \p I for
1078   /// vectorization factor \p VF.
1079   bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1080     assert(VF.isVector() &&
1081            "Profitable to scalarize relevant only for VF > 1.");
1082     assert(
1083         TheLoop->isInnermost() &&
1084         "cost-model should not be used for outer loops (in VPlan-native path)");
1085 
1086     auto Scalars = InstsToScalarize.find(VF);
1087     assert(Scalars != InstsToScalarize.end() &&
1088            "VF not yet analyzed for scalarization profitability");
1089     return Scalars->second.contains(I);
1090   }
1091 
1092   /// Returns true if \p I is known to be uniform after vectorization.
1093   bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1094     assert(
1095         TheLoop->isInnermost() &&
1096         "cost-model should not be used for outer loops (in VPlan-native path)");
1097     // Pseudo probe needs to be duplicated for each unrolled iteration and
1098     // vector lane so that profiled loop trip count can be accurately
1099     // accumulated instead of being undercounted.
1100     if (isa<PseudoProbeInst>(I))
1101       return false;
1102 
1103     if (VF.isScalar())
1104       return true;
1105 
1106     auto UniformsPerVF = Uniforms.find(VF);
1107     assert(UniformsPerVF != Uniforms.end() &&
1108            "VF not yet analyzed for uniformity");
1109     return UniformsPerVF->second.count(I);
1110   }
1111 
1112   /// Returns true if \p I is known to be scalar after vectorization.
1113   bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1114     assert(
1115         TheLoop->isInnermost() &&
1116         "cost-model should not be used for outer loops (in VPlan-native path)");
1117     if (VF.isScalar())
1118       return true;
1119 
1120     auto ScalarsPerVF = Scalars.find(VF);
1121     assert(ScalarsPerVF != Scalars.end() &&
1122            "Scalar values are not calculated for VF");
1123     return ScalarsPerVF->second.count(I);
1124   }
1125 
1126   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1127   /// for vectorization factor \p VF.
1128   bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1129     return VF.isVector() && MinBWs.contains(I) &&
1130            !isProfitableToScalarize(I, VF) &&
1131            !isScalarAfterVectorization(I, VF);
1132   }
1133 
1134   /// Decision that was taken during cost calculation for memory instruction.
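  /// For example (illustrative mapping; the actual choice is cost-based): a
  /// load of a[i] with unit stride maps to CM_Widen, a[n - i] to
  /// CM_Widen_Reverse, a[b[i]] to CM_GatherScatter, and the members of a
  /// strided group such as a[2*i] and a[2*i+1] to CM_Interleave.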
1135   enum InstWidening {
1136     CM_Unknown,
1137     CM_Widen,         // For consecutive accesses with stride +1.
1138     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1139     CM_Interleave,
1140     CM_GatherScatter,
1141     CM_Scalarize,
1142     CM_VectorCall,
1143     CM_IntrinsicCall
1144   };
1145 
1146   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1147   /// instruction \p I and vector width \p VF.
1148   void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1149                            InstructionCost Cost) {
1150     assert(VF.isVector() && "Expected VF >=2");
1151     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1152   }
1153 
1154   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1155   /// interleaving group \p Grp and vector width \p VF.
1156   void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1157                            ElementCount VF, InstWidening W,
1158                            InstructionCost Cost) {
1159     assert(VF.isVector() && "Expected VF >=2");
1160     /// Broadcast this decision to all instructions inside the group. When
1161     /// interleaving, the cost will only be assigned to one instruction, the
1162     /// insert position. For other cases, add the appropriate fraction of the
1163     /// total cost to each instruction. This ensures accurate costs are used,
1164     /// even if the insert position instruction is not used.
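    /// Worked example (hypothetical): for a 4-member group with a total Cost
    /// of 8 and W != CM_Interleave, every member is assigned a cost of 2;
    /// with W == CM_Interleave the insert position gets the full cost of 8
    /// and the other members get a cost of 0.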
1165     InstructionCost InsertPosCost = Cost;
1166     InstructionCost OtherMemberCost = 0;
1167     if (W != CM_Interleave)
1168       OtherMemberCost = InsertPosCost = Cost / Grp->getNumMembers();
1170     for (unsigned Idx = 0; Idx < Grp->getFactor(); ++Idx) {
1171       if (auto *I = Grp->getMember(Idx)) {
1172         if (Grp->getInsertPos() == I)
1173           WideningDecisions[std::make_pair(I, VF)] =
1174               std::make_pair(W, InsertPosCost);
1175         else
1176           WideningDecisions[std::make_pair(I, VF)] =
1177               std::make_pair(W, OtherMemberCost);
1178       }
1179     }
1180   }
1181 
1182   /// Return the cost model decision for the given instruction \p I and vector
1183   /// width \p VF. Return CM_Unknown if this instruction did not pass
1184   /// through the cost modeling.
1185   InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1186     assert(VF.isVector() && "Expected VF to be a vector VF");
1187     assert(
1188         TheLoop->isInnermost() &&
1189         "cost-model should not be used for outer loops (in VPlan-native path)");
1190 
1191     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1192     auto Itr = WideningDecisions.find(InstOnVF);
1193     if (Itr == WideningDecisions.end())
1194       return CM_Unknown;
1195     return Itr->second.first;
1196   }
1197 
1198   /// Return the vectorization cost for the given instruction \p I and vector
1199   /// width \p VF.
1200   InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1201     assert(VF.isVector() && "Expected VF >=2");
1202     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1203     assert(WideningDecisions.contains(InstOnVF) &&
1204            "The cost is not calculated");
1205     return WideningDecisions[InstOnVF].second;
1206   }
1207 
1208   struct CallWideningDecision {
1209     InstWidening Kind;
1210     Function *Variant;
1211     Intrinsic::ID IID;
1212     std::optional<unsigned> MaskPos;
1213     InstructionCost Cost;
1214   };
1215 
1216   void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind,
1217                                Function *Variant, Intrinsic::ID IID,
1218                                std::optional<unsigned> MaskPos,
1219                                InstructionCost Cost) {
1220     assert(!VF.isScalar() && "Expected vector VF");
1221     CallWideningDecisions[std::make_pair(CI, VF)] = {Kind, Variant, IID,
1222                                                      MaskPos, Cost};
1223   }
1224 
1225   CallWideningDecision getCallWideningDecision(CallInst *CI,
1226                                                ElementCount VF) const {
1227     assert(!VF.isScalar() && "Expected vector VF");
1228     return CallWideningDecisions.at(std::make_pair(CI, VF));
1229   }
1230 
1231   /// Return True if instruction \p I is an optimizable truncate whose operand
1232   /// is an induction variable. Such a truncate will be removed by adding a new
1233   /// induction variable with the destination type.
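       /// For example (illustrative): for an i64 induction %iv, a
       /// 'trunc i64 %iv to i32' can be removed by introducing a new i32 induction
       /// variable with truncated start and step values.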
1234   bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1235     // If the instruction is not a truncate, return false.
1236     auto *Trunc = dyn_cast<TruncInst>(I);
1237     if (!Trunc)
1238       return false;
1239 
1240     // Get the source and destination types of the truncate.
1241     Type *SrcTy = toVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1242     Type *DestTy = toVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1243 
1244     // If the truncate is free for the given types, return false. Replacing a
1245     // free truncate with an induction variable would add an induction variable
1246     // update instruction to each iteration of the loop. We exclude from this
1247     // check the primary induction variable since it will need an update
1248     // instruction regardless.
1249     Value *Op = Trunc->getOperand(0);
1250     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1251       return false;
1252 
1253     // If the truncated value is not an induction variable, return false.
1254     return Legal->isInductionPhi(Op);
1255   }
1256 
1257   /// Collects the instructions to scalarize for each predicated instruction in
1258   /// the loop.
1259   void collectInstsToScalarize(ElementCount VF);
1260 
1261   /// Collect Uniform and Scalar values for the given \p VF.
1262   /// The sets depend on CM decision for Load/Store instructions
1263   /// that may be vectorized as interleave, gather-scatter or scalarized.
1264   /// Also make a decision on what to do about call instructions in the loop
1265   /// at that VF -- scalarize, call a known vector routine, or call a
1266   /// vector intrinsic.
1267   void collectUniformsAndScalars(ElementCount VF) {
1268     // Do the analysis once.
1269     if (VF.isScalar() || Uniforms.contains(VF))
1270       return;
1271     setCostBasedWideningDecision(VF);
1272     collectLoopUniforms(VF);
1273     setVectorizedCallDecision(VF);
1274     collectLoopScalars(VF);
1275   }
1276 
1277   /// Returns true if the target machine supports masked store operation
1278   /// for the given \p DataType and kind of access to \p Ptr.
1279   bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1280     return Legal->isConsecutivePtr(DataType, Ptr) &&
1281            TTI.isLegalMaskedStore(DataType, Alignment);
1282   }
1283 
1284   /// Returns true if the target machine supports masked load operation
1285   /// for the given \p DataType and kind of access to \p Ptr.
1286   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1287     return Legal->isConsecutivePtr(DataType, Ptr) &&
1288            TTI.isLegalMaskedLoad(DataType, Alignment);
1289   }
1290 
1291   /// Returns true if the target machine can represent \p V as a masked gather
1292   /// or scatter operation.
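       /// For example (illustrative): an indexed access such as A[B[i]] is not
       /// consecutive, but can still be widened if the target supports masked
       /// gathers/scatters for <VF x Ty>.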
1293   bool isLegalGatherOrScatter(Value *V, ElementCount VF) {
1294     bool LI = isa<LoadInst>(V);
1295     bool SI = isa<StoreInst>(V);
1296     if (!LI && !SI)
1297       return false;
1298     auto *Ty = getLoadStoreType(V);
1299     Align Align = getLoadStoreAlignment(V);
1300     if (VF.isVector())
1301       Ty = VectorType::get(Ty, VF);
1302     return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1303            (SI && TTI.isLegalMaskedScatter(Ty, Align));
1304   }
1305 
1306   /// Returns true if the target machine supports all of the reduction
1307   /// variables found for the given VF.
1308   bool canVectorizeReductions(ElementCount VF) const {
1309     return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1310       const RecurrenceDescriptor &RdxDesc = Reduction.second;
1311       return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1312     }));
1313   }
1314 
1315   /// Given costs for both strategies, return true if the scalar predication
1316   /// lowering should be used for div/rem.  This incorporates an override
1317   /// option so it is not simply a cost comparison.
1318   bool isDivRemScalarWithPredication(InstructionCost ScalarCost,
1319                                      InstructionCost SafeDivisorCost) const {
1320     switch (ForceSafeDivisor) {
1321     case cl::BOU_UNSET:
1322       return ScalarCost < SafeDivisorCost;
1323     case cl::BOU_TRUE:
1324       return false;
1325     case cl::BOU_FALSE:
1326       return true;
1327     }
1328     llvm_unreachable("impossible case value");
1329   }
1330 
1331   /// Returns true if \p I is an instruction which requires predication and
1332   /// for which our chosen predication strategy is scalarization (i.e. we
1333   /// don't have an alternate strategy such as masking available).
1334   /// \p VF is the vectorization factor that will be used to vectorize \p I.
1335   bool isScalarWithPredication(Instruction *I, ElementCount VF) const;
1336 
1337   /// Returns true if \p I is an instruction that needs to be predicated
1338   /// at runtime.  The result is independent of the predication mechanism.
1339   /// Superset of instructions that return true for isScalarWithPredication.
1340   bool isPredicatedInst(Instruction *I) const;
1341 
1342   /// Return the costs for our two available strategies for lowering a
1343   /// div/rem operation which requires speculating at least one lane.
1344   /// First result is for scalarization (will be invalid for scalable
1345   /// vectors); second is for the safe-divisor strategy.
1346   std::pair<InstructionCost, InstructionCost>
1347   getDivRemSpeculationCost(Instruction *I,
1348                            ElementCount VF) const;
1349 
1350   /// Returns true if \p I is a memory instruction with consecutive memory
1351   /// access that can be widened.
1352   bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF);
1353 
1354   /// Returns true if \p I is a memory instruction in an interleaved-group
1355   /// of memory accesses that can be vectorized with wide vector loads/stores
1356   /// and shuffles.
1357   bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF) const;
1358 
1359   /// Check if \p Instr belongs to any interleaved access group.
1360   bool isAccessInterleaved(Instruction *Instr) const {
1361     return InterleaveInfo.isInterleaved(Instr);
1362   }
1363 
1364   /// Get the interleaved access group that \p Instr belongs to.
1365   const InterleaveGroup<Instruction> *
1366   getInterleavedAccessGroup(Instruction *Instr) const {
1367     return InterleaveInfo.getInterleaveGroup(Instr);
1368   }
1369 
1370   /// Returns true if we're required to use a scalar epilogue for at least
1371   /// the final iteration of the original loop.
1372   bool requiresScalarEpilogue(bool IsVectorizing) const {
1373     if (!isScalarEpilogueAllowed()) {
1374       LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
1375       return false;
1376     }
1377     // If we might exit from anywhere but the latch and early exit vectorization
1378     // is disabled, we must run the exiting iteration in scalar form.
1379     if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch() &&
1380         !(EnableEarlyExitVectorization && Legal->hasUncountableEarlyExit())) {
1381       LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: not exiting "
1382                            "from latch block\n");
1383       return true;
1384     }
1385     if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) {
1386       LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: "
1387                            "interleaved group requires scalar epilogue\n");
1388       return true;
1389     }
1390     LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
1391     return false;
1392   }
1393 
1394   /// Returns true if we're required to use a scalar epilogue for at least
1395   /// the final iteration of the original loop for all VFs in \p Range.
1396   /// A scalar epilogue must either be required for all VFs in \p Range or for
1397   /// none.
1398   bool requiresScalarEpilogue(VFRange Range) const {
1399     auto RequiresScalarEpilogue = [this](ElementCount VF) {
1400       return requiresScalarEpilogue(VF.isVector());
1401     };
1402     bool IsRequired = all_of(Range, RequiresScalarEpilogue);
1403     assert(
1404         (IsRequired || none_of(Range, RequiresScalarEpilogue)) &&
1405         "all VFs in range must agree on whether a scalar epilogue is required");
1406     return IsRequired;
1407   }
1408 
1409   /// Returns true if a scalar epilogue is not allowed due to optsize or a
1410   /// loop hint annotation.
1411   bool isScalarEpilogueAllowed() const {
1412     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1413   }
1414 
1415   /// Returns the TailFoldingStyle that is best for the current loop.
1416   TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
1417     if (!ChosenTailFoldingStyle)
1418       return TailFoldingStyle::None;
1419     return IVUpdateMayOverflow ? ChosenTailFoldingStyle->first
1420                                : ChosenTailFoldingStyle->second;
1421   }
1422 
1423   /// Selects and saves the TailFoldingStyle for both cases: whether or not the
1424   /// IV update may overflow.
1425   /// \param IsScalableVF true if scalable vector factors are enabled.
1426   /// \param UserIC User-specified interleave count.
1427   void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC) {
1428     assert(!ChosenTailFoldingStyle && "Tail folding must not be selected yet.");
1429     if (!Legal->canFoldTailByMasking()) {
1430       ChosenTailFoldingStyle =
1431           std::make_pair(TailFoldingStyle::None, TailFoldingStyle::None);
1432       return;
1433     }
1434 
1435     if (!ForceTailFoldingStyle.getNumOccurrences()) {
1436       ChosenTailFoldingStyle = std::make_pair(
1437           TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/true),
1438           TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false));
1439       return;
1440     }
1441 
1442     // Set styles when forced.
1443     ChosenTailFoldingStyle = std::make_pair(ForceTailFoldingStyle.getValue(),
1444                                             ForceTailFoldingStyle.getValue());
1445     if (ForceTailFoldingStyle != TailFoldingStyle::DataWithEVL)
1446       return;
1447     // Override forced styles if needed.
1448     // FIXME: use actual opcode/data type for analysis here.
1449     // FIXME: Investigate opportunity for fixed vector factor.
1450     bool EVLIsLegal =
1451         UserIC <= 1 && TTI.hasActiveVectorLength(0, nullptr, Align()) &&
1452         !EnableVPlanNativePath &&
1453         // FIXME: remove this once fixed-ordered recurrence is supported.
1454         Legal->getFixedOrderRecurrences().empty();
1455     if (!EVLIsLegal) {
1456       // If for some reason EVL mode is unsupported, fall back to
1457       // DataWithoutLaneMask to try to vectorize the loop with folded tail
1458       // in a generic way.
1459       ChosenTailFoldingStyle =
1460           std::make_pair(TailFoldingStyle::DataWithoutLaneMask,
1461                          TailFoldingStyle::DataWithoutLaneMask);
1462       LLVM_DEBUG(
1463           dbgs()
1464           << "LV: Preference for VP intrinsics indicated. Will "
1465              "not try to generate VP Intrinsics "
1466           << (UserIC > 1
1467                   ? "since interleave count specified is greater than 1.\n"
1468                   : "due to non-interleaving reasons.\n"));
1469     }
1470   }
1471 
1472   /// Returns true if all loop blocks should be masked to fold tail loop.
1473   bool foldTailByMasking() const {
1474     // TODO: check if it is possible to check for None style independent of
1475     // IVUpdateMayOverflow flag in getTailFoldingStyle.
1476     return getTailFoldingStyle() != TailFoldingStyle::None;
1477   }
1478 
1479   /// Return the maximum safe number of elements to be processed per vector
1480   /// iteration, i.e. a count that does not prevent store-load forwarding and is
1481   /// safe with regard to the memory dependencies. Required for EVL-based VPlans
1482   /// to correctly calculate AVL (application vector length) as min(remaining
1483   /// AVL, MaxSafeElements).
1484   /// TODO: need to consider adjusting cost model to use this value as a
1485   /// vectorization factor for EVL-based vectorization.
1486   std::optional<unsigned> getMaxSafeElements() const { return MaxSafeElements; }
1487 
1488   /// Returns true if the instructions in this block require predication
1489   /// for any reason, e.g. because tail folding now requires a predicate
1490   /// or because the block in the original loop was predicated.
1491   bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
1492     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1493   }
1494 
1495   /// Returns true if VP intrinsics with explicit vector length support should
1496   /// be generated in the tail folded loop.
1497   bool foldTailWithEVL() const {
1498     return getTailFoldingStyle() == TailFoldingStyle::DataWithEVL;
1499   }
1500 
1501   /// Returns true if the Phi is part of an inloop reduction.
1502   bool isInLoopReduction(PHINode *Phi) const {
1503     return InLoopReductions.contains(Phi);
1504   }
1505 
1506   /// Returns true if the predicated reduction select should be used to set the
1507   /// incoming value for the reduction phi.
1508   bool usePredicatedReductionSelect(unsigned Opcode, Type *PhiTy) const {
1509     // Force to use predicated reduction select since the EVL of the
1510     // second-to-last iteration might not be VF*UF.
1511     if (foldTailWithEVL())
1512       return true;
1513     return PreferPredicatedReductionSelect ||
1514            TTI.preferPredicatedReductionSelect(
1515                Opcode, PhiTy, TargetTransformInfo::ReductionFlags());
1516   }
1517 
1518   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1519   /// with factor VF.  Return the cost of the instruction, including
1520   /// scalarization overhead if it's needed.
1521   InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1522 
1523   /// Estimate cost of a call instruction CI if it were vectorized with factor
1524   /// VF. Return the cost of the instruction, including scalarization overhead
1525   /// if it's needed.
1526   InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const;
1527 
1528   /// Invalidates decisions already taken by the cost model.
1529   void invalidateCostModelingDecisions() {
1530     WideningDecisions.clear();
1531     CallWideningDecisions.clear();
1532     Uniforms.clear();
1533     Scalars.clear();
1534   }
1535 
1536   /// Returns the expected execution cost. The unit of the cost does
1537   /// not matter because we use the 'cost' units to compare different
1538   /// vector widths. The cost that is returned is *not* normalized by
1539   /// the factor width.
1540   InstructionCost expectedCost(ElementCount VF);
1541 
1542   bool hasPredStores() const { return NumPredStores > 0; }
1543 
1544   /// Returns true if epilogue vectorization is considered profitable, and
1545   /// false otherwise.
1546   /// \p VF is the vectorization factor chosen for the original loop.
1547   /// \p IC is an additional scaling factor applied to VF before
1548   /// comparing to EpilogueVectorizationMinVF.
1549   bool isEpilogueVectorizationProfitable(const ElementCount VF,
1550                                          const unsigned IC) const;
1551 
1552   /// Returns the execution time cost of an instruction for a given vector
1553   /// width. Vector width of one means scalar.
1554   InstructionCost getInstructionCost(Instruction *I, ElementCount VF);
1555 
1556   /// Return the cost of instructions in an inloop reduction pattern, if I is
1557   /// part of that pattern.
1558   std::optional<InstructionCost>
1559   getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
1560                           TTI::TargetCostKind CostKind) const;
1561 
1562   /// Returns true if \p Op should be considered invariant and if it is
1563   /// trivially hoistable.
1564   bool shouldConsiderInvariant(Value *Op);
1565 
1566 private:
1567   unsigned NumPredStores = 0;
1568 
1569   /// \return An upper bound for the vectorization factors for both
1570   /// fixed and scalable vectorization, where the minimum-known number of
1571   /// elements is a power-of-2 larger than zero. If scalable vectorization is
1572   /// disabled or unsupported, then the scalable part will be equal to
1573   /// ElementCount::getScalable(0).
1574   FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount,
1575                                            ElementCount UserVF,
1576                                            bool FoldTailByMasking);
1577 
1578   /// \return the maximized element count based on the target's vector
1579   /// registers and the loop trip-count, but limited to a maximum safe VF.
1580   /// This is a helper function of computeFeasibleMaxVF.
1581   ElementCount getMaximizedVFForTarget(unsigned MaxTripCount,
1582                                        unsigned SmallestType,
1583                                        unsigned WidestType,
1584                                        ElementCount MaxSafeVF,
1585                                        bool FoldTailByMasking);
1586 
1587   /// Checks if scalable vectorization is supported and enabled. Caches the
1588   /// result to avoid repeated debug dumps for repeated queries.
1589   bool isScalableVectorizationAllowed();
1590 
1591   /// \return the maximum legal scalable VF, based on the safe max number
1592   /// of elements.
1593   ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1594 
1595   /// Calculate vectorization cost of memory instruction \p I.
1596   InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1597 
1598   /// The cost computation for scalarized memory instruction.
1599   InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1600 
1601   /// The cost computation for interleaving group of memory instructions.
1602   InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1603 
1604   /// The cost computation for Gather/Scatter instruction.
1605   InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1606 
1607   /// The cost computation for widening instruction \p I with consecutive
1608   /// memory access.
1609   InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1610 
1611   /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1612   /// Load: scalar load + broadcast.
1613   /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1614   /// element)
1615   InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1616 
1617   /// Estimate the overhead of scalarizing an instruction. This is a
1618   /// convenience wrapper for the type-based getScalarizationOverhead API.
1619   InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF,
1620                                            TTI::TargetCostKind CostKind) const;
1621 
1622   /// Returns true if an artificially high cost for emulated masked memrefs
1623   /// should be used.
1624   bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1625 
1626   /// Map of scalar integer values to the smallest bitwidth they can be legally
1627   /// represented as. The vector equivalents of these values should be truncated
1628   /// to this type.
1629   MapVector<Instruction *, uint64_t> MinBWs;
1630 
1631   /// A type representing the costs for instructions if they were to be
1632   /// scalarized rather than vectorized. The entries are Instruction-Cost
1633   /// pairs.
1634   using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1635 
1636   /// For each VF, the set of BasicBlocks that are known to be present after
1637   /// vectorization as predicated blocks.
1638   DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>>
1639       PredicatedBBsAfterVectorization;
1640 
1641   /// Records whether it is allowed to have the original scalar loop execute at
1642   /// least once. This may be needed as a fallback loop in case runtime
1643   /// aliasing/dependence checks fail, or to handle the tail/remainder
1644   /// iterations when the trip count is unknown or doesn't divide by the VF,
1645   /// or as a peel-loop to handle gaps in interleave-groups.
1646   /// Under optsize and when the trip count is very small we don't allow any
1647   /// iterations to execute in the scalar loop.
1648   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1649 
1650   /// The tail folding style finally chosen. The first element is used if the
1651   /// IV update may overflow; the second element is used if it does not.
1652   std::optional<std::pair<TailFoldingStyle, TailFoldingStyle>>
1653       ChosenTailFoldingStyle;
1654 
1655   /// true if scalable vectorization is supported and enabled.
1656   std::optional<bool> IsScalableVectorizationAllowed;
1657 
1658   /// Maximum safe number of elements to be processed per vector iteration,
1659   /// which do not prevent store-load forwarding and are safe with regard to the
1660   /// memory dependencies. Required for EVL-based vectorization, where this
1661   /// value is used as the upper bound of the safe AVL.
1662   std::optional<unsigned> MaxSafeElements;
1663 
1664   /// A map holding scalar costs for different vectorization factors. The
1665   /// presence of a cost for an instruction in the mapping indicates that the
1666   /// instruction will be scalarized when vectorizing with the associated
1667   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1668   DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1669 
1670   /// Holds the instructions known to be uniform after vectorization.
1671   /// The data is collected per VF.
1672   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1673 
1674   /// Holds the instructions known to be scalar after vectorization.
1675   /// The data is collected per VF.
1676   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1677 
1678   /// Holds the instructions (address computations) that are forced to be
1679   /// scalarized.
1680   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1681 
1682   /// PHINodes of the reductions that should be expanded in-loop.
1683   SmallPtrSet<PHINode *, 4> InLoopReductions;
1684 
1685   /// A Map of inloop reduction operations and their immediate chain operand.
1686   /// FIXME: This can be removed once reductions can be costed correctly in
1687   /// VPlan. This was added to allow quick lookup of the inloop operations.
1688   DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1689 
1690   /// Returns the expected difference in cost from scalarizing the expression
1691   /// feeding a predicated instruction \p PredInst. The instructions to
1692   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1693   /// non-negative return value implies the expression will be scalarized.
1694   /// Currently, only single-use chains are considered for scalarization.
1695   InstructionCost computePredInstDiscount(Instruction *PredInst,
1696                                           ScalarCostsTy &ScalarCosts,
1697                                           ElementCount VF);
1698 
1699   /// Collect the instructions that are uniform after vectorization. An
1700   /// instruction is uniform if we represent it with a single scalar value in
1701   /// the vectorized loop corresponding to each vector iteration. Examples of
1702   /// uniform instructions include pointer operands of consecutive or
1703   /// interleaved memory accesses. Note that although uniformity implies an
1704   /// instruction will be scalar, the reverse is not true. In general, a
1705   /// scalarized instruction will be represented by VF scalar values in the
1706   /// vectorized loop, each corresponding to an iteration of the original
1707   /// scalar loop.
1708   void collectLoopUniforms(ElementCount VF);
1709 
1710   /// Collect the instructions that are scalar after vectorization. An
1711   /// instruction is scalar if it is known to be uniform or will be scalarized
1712   /// during vectorization. collectLoopScalars should only add non-uniform nodes
1713   /// to the list if they are used by a load/store instruction that is marked as
1714   /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
1715   /// VF values in the vectorized loop, each corresponding to an iteration of
1716   /// the original scalar loop.
1717   void collectLoopScalars(ElementCount VF);
1718 
1719   /// Keeps cost model vectorization decision and cost for instructions.
1720   /// Right now it is used for memory instructions only.
1721   using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1722                                 std::pair<InstWidening, InstructionCost>>;
1723 
1724   DecisionList WideningDecisions;
1725 
1726   using CallDecisionList =
1727       DenseMap<std::pair<CallInst *, ElementCount>, CallWideningDecision>;
1728 
1729   CallDecisionList CallWideningDecisions;
1730 
1731   /// Returns true if \p V is expected to be vectorized and it needs to be
1732   /// extracted.
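       /// For example (illustrative): a value produced by a widened instruction
       /// but also used by a scalarized (replicated) user requires an
       /// extractelement per lane to feed the scalar copies.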
1733   bool needsExtract(Value *V, ElementCount VF) const {
1734     Instruction *I = dyn_cast<Instruction>(V);
1735     if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1736         TheLoop->isLoopInvariant(I) ||
1737         getWideningDecision(I, VF) == CM_Scalarize)
1738       return false;
1739 
1740     // Assume we can vectorize V (and hence we need extraction) if the
1741     // scalars are not computed yet. This can happen, because it is called
1742     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1743     // the scalars are collected. That should be a safe assumption in most
1744     // cases, because we check if the operands have vectorizable types
1745     // beforehand in LoopVectorizationLegality.
1746     return !Scalars.contains(VF) || !isScalarAfterVectorization(I, VF);
1747   };
1748 
1749   /// Returns a range containing only operands needing to be extracted.
1750   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1751                                                    ElementCount VF) const {
1752     return SmallVector<Value *, 4>(make_filter_range(
1753         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1754   }
1755 
1756 public:
1757   /// The loop that we evaluate.
1758   Loop *TheLoop;
1759 
1760   /// Predicated scalar evolution analysis.
1761   PredicatedScalarEvolution &PSE;
1762 
1763   /// Loop Info analysis.
1764   LoopInfo *LI;
1765 
1766   /// Vectorization legality.
1767   LoopVectorizationLegality *Legal;
1768 
1769   /// Vector target information.
1770   const TargetTransformInfo &TTI;
1771 
1772   /// Target Library Info.
1773   const TargetLibraryInfo *TLI;
1774 
1775   /// Demanded bits analysis.
1776   DemandedBits *DB;
1777 
1778   /// Assumption cache.
1779   AssumptionCache *AC;
1780 
1781   /// Interface to emit optimization remarks.
1782   OptimizationRemarkEmitter *ORE;
1783 
1784   const Function *TheFunction;
1785 
1786   /// Loop Vectorize Hint.
1787   const LoopVectorizeHints *Hints;
1788 
1789   /// The interleave access information contains groups of interleaved accesses
1790   /// with the same stride that are close to each other.
1791   InterleavedAccessInfo &InterleaveInfo;
1792 
1793   /// Values to ignore in the cost model.
1794   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1795 
1796   /// Values to ignore in the cost model when VF > 1.
1797   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1798 
1799   /// All element types found in the loop.
1800   SmallPtrSet<Type *, 16> ElementTypesInLoop;
1801 };
1802 } // end namespace llvm
1803 
1804 namespace {
1805 /// Helper struct to manage generating runtime checks for vectorization.
1806 ///
1807 /// The runtime checks are created up-front in temporary blocks, un-linked from
1808 /// the existing IR, to allow better estimation of their cost. After deciding to
1809 /// vectorize, the checks are moved back into the IR. If we decide not to
1810 /// vectorize, the temporary blocks are removed completely.
1811 class GeneratedRTChecks {
1812   /// Basic block which contains the generated SCEV checks, if any.
1813   BasicBlock *SCEVCheckBlock = nullptr;
1814 
1815   /// The value representing the result of the generated SCEV checks. If it is
1816   /// nullptr, either no SCEV checks have been generated or they have been used.
1817   Value *SCEVCheckCond = nullptr;
1818 
1819   /// Basic block which contains the generated memory runtime checks, if any.
1820   BasicBlock *MemCheckBlock = nullptr;
1821 
1822   /// The value representing the result of the generated memory runtime checks.
1823   /// If it is nullptr, either no memory runtime checks have been generated or
1824   /// they have been used.
1825   Value *MemRuntimeCheckCond = nullptr;
1826 
1827   DominatorTree *DT;
1828   LoopInfo *LI;
1829   TargetTransformInfo *TTI;
1830 
1831   SCEVExpander SCEVExp;
1832   SCEVExpander MemCheckExp;
1833 
1834   bool CostTooHigh = false;
1835   const bool AddBranchWeights;
1836 
1837   Loop *OuterLoop = nullptr;
1838 
1839   PredicatedScalarEvolution &PSE;
1840 
1841 public:
1842   GeneratedRTChecks(PredicatedScalarEvolution &PSE, DominatorTree *DT,
1843                     LoopInfo *LI, TargetTransformInfo *TTI,
1844                     const DataLayout &DL, bool AddBranchWeights)
1845       : DT(DT), LI(LI), TTI(TTI), SCEVExp(*PSE.getSE(), DL, "scev.check"),
1846         MemCheckExp(*PSE.getSE(), DL, "scev.check"),
1847         AddBranchWeights(AddBranchWeights), PSE(PSE) {}
1848 
1849   /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1850   /// accurately estimate the cost of the runtime checks. The blocks are
1851   /// un-linked from the IR and are added back during vector code generation. If
1852   /// there is no vector code generation, the check blocks are removed
1853   /// completely.
1854   void create(Loop *L, const LoopAccessInfo &LAI,
1855               const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {
1856 
1857     // Hard cutoff to limit compile-time increase in case a very large number of
1858     // runtime checks needs to be generated.
1859     // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
1860     // profile info.
1861     CostTooHigh =
1862         LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold;
1863     if (CostTooHigh)
1864       return;
1865 
1866     BasicBlock *LoopHeader = L->getHeader();
1867     BasicBlock *Preheader = L->getLoopPreheader();
1868 
1869     // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1870     // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1871     // may be used by SCEVExpander. The blocks will be un-linked from their
1872     // predecessors and removed from LI & DT at the end of the function.
1873     if (!UnionPred.isAlwaysTrue()) {
1874       SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1875                                   nullptr, "vector.scevcheck");
1876 
1877       SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1878           &UnionPred, SCEVCheckBlock->getTerminator());
1879     }
1880 
1881     const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1882     if (RtPtrChecking.Need) {
1883       auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1884       MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
1885                                  "vector.memcheck");
1886 
1887       auto DiffChecks = RtPtrChecking.getDiffChecks();
1888       if (DiffChecks) {
1889         Value *RuntimeVF = nullptr;
1890         MemRuntimeCheckCond = addDiffRuntimeChecks(
1891             MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp,
1892             [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
1893               if (!RuntimeVF)
1894                 RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);
1895               return RuntimeVF;
1896             },
1897             IC);
1898       } else {
1899         MemRuntimeCheckCond = addRuntimeChecks(
1900             MemCheckBlock->getTerminator(), L, RtPtrChecking.getChecks(),
1901             MemCheckExp, VectorizerParams::HoistRuntimeChecks);
1902       }
1903       assert(MemRuntimeCheckCond &&
1904              "no RT checks generated although RtPtrChecking "
1905              "claimed checks are required");
1906     }
1907 
1908     if (!MemCheckBlock && !SCEVCheckBlock)
1909       return;
1910 
1911     // Unhook the temporary block with the checks, update various places
1912     // accordingly.
1913     if (SCEVCheckBlock)
1914       SCEVCheckBlock->replaceAllUsesWith(Preheader);
1915     if (MemCheckBlock)
1916       MemCheckBlock->replaceAllUsesWith(Preheader);
1917 
1918     if (SCEVCheckBlock) {
1919       SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1920       new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
1921       Preheader->getTerminator()->eraseFromParent();
1922     }
1923     if (MemCheckBlock) {
1924       MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1925       new UnreachableInst(Preheader->getContext(), MemCheckBlock);
1926       Preheader->getTerminator()->eraseFromParent();
1927     }
1928 
1929     DT->changeImmediateDominator(LoopHeader, Preheader);
1930     if (MemCheckBlock) {
1931       DT->eraseNode(MemCheckBlock);
1932       LI->removeBlock(MemCheckBlock);
1933     }
1934     if (SCEVCheckBlock) {
1935       DT->eraseNode(SCEVCheckBlock);
1936       LI->removeBlock(SCEVCheckBlock);
1937     }
1938 
1939     // Outer loop is used as part of the later cost calculations.
1940     OuterLoop = L->getParentLoop();
1941   }
1942 
1943   InstructionCost getCost() {
1944     if (SCEVCheckBlock || MemCheckBlock)
1945       LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");
1946 
1947     if (CostTooHigh) {
1948       InstructionCost Cost;
1949       Cost.setInvalid();
1950       LLVM_DEBUG(dbgs() << "  number of checks exceeded threshold\n");
1951       return Cost;
1952     }
1953 
1954     InstructionCost RTCheckCost = 0;
1955     if (SCEVCheckBlock)
1956       for (Instruction &I : *SCEVCheckBlock) {
1957         if (SCEVCheckBlock->getTerminator() == &I)
1958           continue;
1959         InstructionCost C =
1960             TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
1961         LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
1962         RTCheckCost += C;
1963       }
1964     if (MemCheckBlock) {
1965       InstructionCost MemCheckCost = 0;
1966       for (Instruction &I : *MemCheckBlock) {
1967         if (MemCheckBlock->getTerminator() == &I)
1968           continue;
1969         InstructionCost C =
1970             TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
1971         LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
1972         MemCheckCost += C;
1973       }
1974 
1975       // If the runtime memory checks are being created inside an outer loop
1976       // we should find out if these checks are outer loop invariant. If so,
1977       // the checks will likely be hoisted out and so the effective cost will
1978       // reduce according to the outer loop trip count.
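           // For example (illustrative): a memcheck cost of 20 inside an outer
           // loop with an estimated trip count of 4 is treated as an effective
           // cost of 5.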
1979       if (OuterLoop) {
1980         ScalarEvolution *SE = MemCheckExp.getSE();
1981         // TODO: If profitable, we could refine this further by analysing every
1982         // individual memory check, since there could be a mixture of loop
1983         // variant and invariant checks that mean the final condition is
1984         // variant.
1985         const SCEV *Cond = SE->getSCEV(MemRuntimeCheckCond);
1986         if (SE->isLoopInvariant(Cond, OuterLoop)) {
1987           // It seems reasonable to assume that we can reduce the effective
1988           // cost of the checks even when we know nothing about the trip
1989           // count. Assume that the outer loop executes at least twice.
1990           unsigned BestTripCount = 2;
1991 
1992           // Get the best known TC estimate.
1993           if (auto EstimatedTC = getSmallBestKnownTC(
1994                   PSE, OuterLoop, /* CanUseConstantMax = */ false))
1995             BestTripCount = *EstimatedTC;
1996 
1997           BestTripCount = std::max(BestTripCount, 1U);
1998           InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount;
1999 
2000           // Let's ensure the cost is always at least 1.
2001           NewMemCheckCost = std::max(*NewMemCheckCost.getValue(),
2002                                      (InstructionCost::CostType)1);
2003 
2004           if (BestTripCount > 1)
2005             LLVM_DEBUG(dbgs()
2006                        << "We expect runtime memory checks to be hoisted "
2007                        << "out of the outer loop. Cost reduced from "
2008                        << MemCheckCost << " to " << NewMemCheckCost << '\n');
2009 
2010           MemCheckCost = NewMemCheckCost;
2011         }
2012       }
2013 
2014       RTCheckCost += MemCheckCost;
2015     }
2016 
2017     if (SCEVCheckBlock || MemCheckBlock)
2018       LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
2019                         << "\n");
2020 
2021     return RTCheckCost;
2022   }
2023 
2024   /// Remove the created SCEV & memory runtime check blocks & instructions, if
2025   /// unused.
2026   ~GeneratedRTChecks() {
2027     SCEVExpanderCleaner SCEVCleaner(SCEVExp);
2028     SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
2029     if (!SCEVCheckCond)
2030       SCEVCleaner.markResultUsed();
2031 
2032     if (!MemRuntimeCheckCond)
2033       MemCheckCleaner.markResultUsed();
2034 
2035     if (MemRuntimeCheckCond) {
2036       auto &SE = *MemCheckExp.getSE();
2037       // Memory runtime check generation creates compares that use expanded
2038       // values. Remove them before running the SCEVExpanderCleaners.
2039       for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
2040         if (MemCheckExp.isInsertedInstruction(&I))
2041           continue;
2042         SE.forgetValue(&I);
2043         I.eraseFromParent();
2044       }
2045     }
2046     MemCheckCleaner.cleanup();
2047     SCEVCleaner.cleanup();
2048 
2049     if (SCEVCheckCond)
2050       SCEVCheckBlock->eraseFromParent();
2051     if (MemRuntimeCheckCond)
2052       MemCheckBlock->eraseFromParent();
2053   }
2054 
2055   /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
2056   /// adjusts the branches to branch to the vector preheader or \p Bypass,
2057   /// depending on the generated condition.
2058   BasicBlock *emitSCEVChecks(BasicBlock *Bypass,
2059                              BasicBlock *LoopVectorPreHeader) {
2060     if (!SCEVCheckCond)
2061       return nullptr;
2062 
2063     Value *Cond = SCEVCheckCond;
2064     // Mark the check as used, to prevent it from being removed during cleanup.
2065     SCEVCheckCond = nullptr;
2066     if (auto *C = dyn_cast<ConstantInt>(Cond))
2067       if (C->isZero())
2068         return nullptr;
2069 
2070     auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2071 
2072     BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
2073     // Create new preheader for vector loop.
2074     if (OuterLoop)
2075       OuterLoop->addBasicBlockToLoop(SCEVCheckBlock, *LI);
2076 
2077     SCEVCheckBlock->getTerminator()->eraseFromParent();
2078     SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2079     Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2080                                                 SCEVCheckBlock);
2081 
2082     DT->addNewBlock(SCEVCheckBlock, Pred);
2083     DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
2084 
2085     BranchInst &BI = *BranchInst::Create(Bypass, LoopVectorPreHeader, Cond);
2086     if (AddBranchWeights)
2087       setBranchWeights(BI, SCEVCheckBypassWeights, /*IsExpected=*/false);
2088     ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), &BI);
2089     return SCEVCheckBlock;
2090   }
2091 
2092   /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2093   /// the branches to branch to the vector preheader or \p Bypass, depending on
2094   /// the generated condition.
2095   BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass,
2096                                    BasicBlock *LoopVectorPreHeader) {
2097     // Check if we generated code that checks in runtime if arrays overlap.
2098     if (!MemRuntimeCheckCond)
2099       return nullptr;
2100 
2101     auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2102     Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2103                                                 MemCheckBlock);
2104 
2105     DT->addNewBlock(MemCheckBlock, Pred);
2106     DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
2107     MemCheckBlock->moveBefore(LoopVectorPreHeader);
2108 
2109     if (OuterLoop)
2110       OuterLoop->addBasicBlockToLoop(MemCheckBlock, *LI);
2111 
2112     BranchInst &BI =
2113         *BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond);
2114     if (AddBranchWeights) {
2115       setBranchWeights(BI, MemCheckBypassWeights, /*IsExpected=*/false);
2116     }
2117     ReplaceInstWithInst(MemCheckBlock->getTerminator(), &BI);
2118     MemCheckBlock->getTerminator()->setDebugLoc(
2119         Pred->getTerminator()->getDebugLoc());
2120 
2121     // Mark the check as used, to prevent it from being removed during cleanup.
2122     MemRuntimeCheckCond = nullptr;
2123     return MemCheckBlock;
2124   }
2125 };
2126 } // namespace
2127 
2128 static bool useActiveLaneMask(TailFoldingStyle Style) {
2129   return Style == TailFoldingStyle::Data ||
2130          Style == TailFoldingStyle::DataAndControlFlow ||
2131          Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2132 }
2133 
2134 static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style) {
2135   return Style == TailFoldingStyle::DataAndControlFlow ||
2136          Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2137 }
2138 
2139 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
2140 // vectorization. The loop needs to be annotated with #pragma omp simd
2141 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
2142 // vector length information is not provided, vectorization is not considered
2143 // explicit. Interleave hints are not allowed either. These limitations will be
2144 // relaxed in the future.
2145 // Please note that we are currently forced to abuse the pragma 'clang
2146 // vectorize' semantics. This pragma provides *auto-vectorization hints*
2147 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2148 // provides *explicit vectorization hints* (LV can bypass legal checks and
2149 // assume that vectorization is legal). However, both hints are implemented
2150 // using the same metadata (llvm.loop.vectorize, processed by
2151 // LoopVectorizeHints). This will be fixed in the future when the native IR
2152 // representation for pragma 'omp simd' is introduced.
2153 static bool isExplicitVecOuterLoop(Loop *OuterLp,
2154                                    OptimizationRemarkEmitter *ORE) {
2155   assert(!OuterLp->isInnermost() && "This is not an outer loop");
2156   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2157 
2158   // Only outer loops with an explicit vectorization hint are supported.
2159   // Unannotated outer loops are ignored.
2160   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
2161     return false;
2162 
2163   Function *Fn = OuterLp->getHeader()->getParent();
2164   if (!Hints.allowVectorization(Fn, OuterLp,
2165                                 true /*VectorizeOnlyWhenForced*/)) {
2166     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2167     return false;
2168   }
2169 
2170   if (Hints.getInterleave() > 1) {
2171     // TODO: Interleave support is future work.
2172     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2173                          "outer loops.\n");
2174     Hints.emitRemarkWithHints();
2175     return false;
2176   }
2177 
2178   return true;
2179 }
2180 
2181 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
2182                                   OptimizationRemarkEmitter *ORE,
2183                                   SmallVectorImpl<Loop *> &V) {
2184   // Collect inner loops and outer loops without irreducible control flow. For
2185   // now, only collect outer loops that have explicit vectorization hints. If we
2186   // are stress testing the VPlan H-CFG construction, we collect the outermost
2187   // loop of every loop nest.
2188   if (L.isInnermost() || VPlanBuildStressTest ||
2189       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
2190     LoopBlocksRPO RPOT(&L);
2191     RPOT.perform(LI);
2192     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2193       V.push_back(&L);
2194       // TODO: Collect inner loops inside marked outer loops in case
2195       // vectorization fails for the outer loop. Do not invoke
2196       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2197       // already known to be reducible. We can use an inherited attribute for
2198       // that.
2199       return;
2200     }
2201   }
2202   for (Loop *InnerL : L)
2203     collectSupportedLoops(*InnerL, LI, ORE, V);
2204 }
2205 
2206 //===----------------------------------------------------------------------===//
2207 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2208 // LoopVectorizationCostModel and LoopVectorizationPlanner.
2209 //===----------------------------------------------------------------------===//
2210 
2211 /// Compute the transformed value of Index at offset StartValue using step
2212 /// StepValue.
2213 /// For integer induction, returns StartValue + Index * StepValue.
2214 /// For pointer induction, returns StartValue[Index * StepValue].
2215 /// FIXME: The newly created binary instructions should contain nsw/nuw
2216 /// flags, which can be found from the original scalar operations.
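     /// For example (illustrative): with StartValue %base, Index 7 and Step 4, an
     /// integer induction yields %base + 28, while a pointer induction yields the
     /// address of %base[28].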
2217 static Value *
2218 emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue,
2219                      Value *Step,
2220                      InductionDescriptor::InductionKind InductionKind,
2221                      const BinaryOperator *InductionBinOp) {
2222   Type *StepTy = Step->getType();
2223   Value *CastedIndex = StepTy->isIntegerTy()
2224                            ? B.CreateSExtOrTrunc(Index, StepTy)
2225                            : B.CreateCast(Instruction::SIToFP, Index, StepTy);
2226   if (CastedIndex != Index) {
2227     CastedIndex->setName(CastedIndex->getName() + ".cast");
2228     Index = CastedIndex;
2229   }
2230 
2231   // Note: the IR at this point is broken. We cannot use SE to create any new
2232   // SCEV and then expand it, hoping that SCEV's simplification will give us
2233   // more optimal code. Unfortunately, attempting to do so on invalid IR may
2234   // lead to various SCEV crashes. So all we can do is use the builder and rely
2235   // on InstCombine for future simplifications. Here we handle some trivial
2236   // cases only.
2237   auto CreateAdd = [&B](Value *X, Value *Y) {
2238     assert(X->getType() == Y->getType() && "Types don't match!");
2239     if (auto *CX = dyn_cast<ConstantInt>(X))
2240       if (CX->isZero())
2241         return Y;
2242     if (auto *CY = dyn_cast<ConstantInt>(Y))
2243       if (CY->isZero())
2244         return X;
2245     return B.CreateAdd(X, Y);
2246   };
2247 
2248   // We allow X to be a vector type, in which case Y will potentially be
2249   // splatted into a vector with the same element count.
2250   auto CreateMul = [&B](Value *X, Value *Y) {
2251     assert(X->getType()->getScalarType() == Y->getType() &&
2252            "Types don't match!");
2253     if (auto *CX = dyn_cast<ConstantInt>(X))
2254       if (CX->isOne())
2255         return Y;
2256     if (auto *CY = dyn_cast<ConstantInt>(Y))
2257       if (CY->isOne())
2258         return X;
2259     VectorType *XVTy = dyn_cast<VectorType>(X->getType());
2260     if (XVTy && !isa<VectorType>(Y->getType()))
2261       Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
2262     return B.CreateMul(X, Y);
2263   };
2264 
2265   switch (InductionKind) {
2266   case InductionDescriptor::IK_IntInduction: {
2267     assert(!isa<VectorType>(Index->getType()) &&
2268            "Vector indices not supported for integer inductions yet");
2269     assert(Index->getType() == StartValue->getType() &&
2270            "Index type does not match StartValue type");
2271     if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne())
2272       return B.CreateSub(StartValue, Index);
2273     auto *Offset = CreateMul(Index, Step);
2274     return CreateAdd(StartValue, Offset);
2275   }
2276   case InductionDescriptor::IK_PtrInduction:
2277     return B.CreatePtrAdd(StartValue, CreateMul(Index, Step));
2278   case InductionDescriptor::IK_FpInduction: {
2279     assert(!isa<VectorType>(Index->getType()) &&
2280            "Vector indices not supported for FP inductions yet");
2281     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2282     assert(InductionBinOp &&
2283            (InductionBinOp->getOpcode() == Instruction::FAdd ||
2284             InductionBinOp->getOpcode() == Instruction::FSub) &&
2285            "Original bin op should be defined for FP induction");
2286 
2287     Value *MulExp = B.CreateFMul(Step, Index);
2288     return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2289                          "induction");
2290   }
2291   case InductionDescriptor::IK_NoInduction:
2292     return nullptr;
2293   }
2294   llvm_unreachable("invalid enum");
2295 }
2296 
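     /// Return the maximum value of vscale to assume for \p F, preferring the
     /// target's value if available and otherwise falling back to the function's
     /// vscale_range attribute.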
2297 std::optional<unsigned> getMaxVScale(const Function &F,
2298                                      const TargetTransformInfo &TTI) {
2299   if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale())
2300     return MaxVScale;
2301 
2302   if (F.hasFnAttribute(Attribute::VScaleRange))
2303     return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
2304 
2305   return std::nullopt;
2306 }
2307 
2308 /// For the given VF and UF and maximum trip count computed for the loop, return
2309 /// whether the induction variable might overflow in the vectorized loop. If not,
2310 /// then we know a runtime overflow check always evaluates to false and can be
2311 /// removed.
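     /// For example (illustrative): with an i32 widest induction type, a known
     /// maximum trip count of 1000 and VF * UF = 8, the induction cannot wrap, so
     /// the overflow check is known to be false.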
2312 static bool isIndvarOverflowCheckKnownFalse(
2313     const LoopVectorizationCostModel *Cost,
2314     ElementCount VF, std::optional<unsigned> UF = std::nullopt) {
2315   // Always be conservative if we don't know the exact unroll factor.
2316   unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF);
2317 
2318   Type *IdxTy = Cost->Legal->getWidestInductionType();
2319   APInt MaxUIntTripCount = cast<IntegerType>(IdxTy)->getMask();
2320 
2321   // The runtime overflow check is known to be false iff the (max) trip-count
2322   // is known and (max) trip-count + (VF * UF) does not overflow in the type of
2323   // the vector loop induction variable.
2324   if (unsigned TC = Cost->PSE.getSmallConstantMaxTripCount()) {
2325     uint64_t MaxVF = VF.getKnownMinValue();
2326     if (VF.isScalable()) {
2327       std::optional<unsigned> MaxVScale =
2328           getMaxVScale(*Cost->TheFunction, Cost->TTI);
2329       if (!MaxVScale)
2330         return false;
2331       MaxVF *= *MaxVScale;
2332     }
2333 
2334     return (MaxUIntTripCount - TC).ugt(MaxVF * MaxUF);
2335   }
2336 
2337   return false;
2338 }
2339 
2340 // Return whether we allow using masked interleave-groups (for dealing with
2341 // strided loads/stores that reside in predicated blocks, or for dealing
2342 // with gaps).
2343 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2344   // If an override option has been passed in for interleaved accesses, use it.
2345   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2346     return EnableMaskedInterleavedMemAccesses;
2347 
2348   return TTI.enableMaskedInterleavedAccessVectorization();
2349 }
2350 
2351 void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr,
2352                                                VPReplicateRecipe *RepRecipe,
2353                                                const VPLane &Lane,
2354                                                VPTransformState &State) {
2355   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2356 
2357   // Does this instruction return a value?
2358   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2359 
2360   Instruction *Cloned = Instr->clone();
2361   if (!IsVoidRetTy) {
2362     Cloned->setName(Instr->getName() + ".cloned");
2363 #if !defined(NDEBUG)
2364     // Verify that VPlan type inference results agree with the type of the
2365     // generated values.
2366     assert(State.TypeAnalysis.inferScalarType(RepRecipe) == Cloned->getType() &&
2367            "inferred type and type from generated instructions do not match");
2368 #endif
2369   }
2370 
2371   RepRecipe->setFlags(Cloned);
2372 
2373   if (auto DL = Instr->getDebugLoc())
2374     State.setDebugLocFrom(DL);
2375 
2376   // Replace the operands of the cloned instructions with their scalar
2377   // equivalents in the new loop.
2378   for (const auto &I : enumerate(RepRecipe->operands())) {
2379     auto InputLane = Lane;
2380     VPValue *Operand = I.value();
2381     if (vputils::isUniformAfterVectorization(Operand))
2382       InputLane = VPLane::getFirstLane();
2383     Cloned->setOperand(I.index(), State.get(Operand, InputLane));
2384   }
2385   State.addNewMetadata(Cloned, Instr);
2386 
2387   // Place the cloned scalar in the new loop.
2388   State.Builder.Insert(Cloned);
2389 
2390   State.set(RepRecipe, Cloned, Lane);
2391 
2392   // If we just cloned a new assumption, add it to the assumption cache.
2393   if (auto *II = dyn_cast<AssumeInst>(Cloned))
2394     AC->registerAssumption(II);
2395 
2396   // End if-block.
2397   VPRegionBlock *Parent = RepRecipe->getParent()->getParent();
2398   bool IfPredicateInstr = Parent ? Parent->isReplicator() : false;
2399   assert(
2400       (Parent || !RepRecipe->getParent()->getPlan()->getVectorLoopRegion() ||
2401        all_of(RepRecipe->operands(),
2402               [](VPValue *Op) { return Op->isDefinedOutsideLoopRegions(); })) &&
2403       "Expected the recipe to either be within a region or have all of its "
2404       "operands defined outside the vectorized region.");
2405   if (IfPredicateInstr)
2406     PredicatedInstructions.push_back(Cloned);
2407 }
2408 
2409 Value *
2410 InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
2411   if (VectorTripCount)
2412     return VectorTripCount;
2413 
2414   Value *TC = getTripCount();
2415   IRBuilder<> Builder(InsertBlock->getTerminator());
2416 
2417   Type *Ty = TC->getType();
2418   // This is where we can make the step a runtime constant.
2419   Value *Step = createStepForVF(Builder, Ty, VF, UF);
2420 
2421   // If the tail is to be folded by masking, round the number of iterations N
2422   // up to a multiple of Step instead of rounding down. This is done by first
2423   // adding Step-1 and then rounding down. Note that it's ok if this addition
2424   // overflows: the vector induction variable will eventually wrap to zero given
2425   // that it starts at zero and its Step is a power of two; the loop will then
2426   // exit, with the last early-exit vector comparison also producing all-true.
2427   // For scalable vectors the VF is not guaranteed to be a power of 2, but this
2428   // is accounted for in emitIterationCountCheck that adds an overflow check.
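  // For example, with TC = 10, VF = 4 and UF = 1 (Step = 4), folding the tail
  // rounds TC up to 13, the remainder below becomes 13 % 4 = 1 and the vector
  // trip count 12, i.e. three masked vector iterations cover the 10 scalar ones.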
2429   if (Cost->foldTailByMasking()) {
2430     assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
2431            "VF*UF must be a power of 2 when folding tail by masking");
2432     TC = Builder.CreateAdd(TC, Builder.CreateSub(Step, ConstantInt::get(Ty, 1)),
2433                            "n.rnd.up");
2434   }
2435 
2436   // Now we need to generate the expression for the part of the loop that the
2437   // vectorized body will execute. This is equal to N - (N % Step) if scalar
2438   // iterations are not required for correctness, or N - Step, otherwise. Step
2439   // is equal to the vectorization factor (number of SIMD elements) times the
2440   // unroll factor (number of SIMD instructions).
2441   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2442 
2443   // There are cases where we *must* run at least one iteration in the remainder
2444   // loop.  See the cost model for when this can happen.  If the step evenly
2445   // divides the trip count, we set the remainder to be equal to the step. If
2446   // the step does not evenly divide the trip count, no adjustment is necessary
2447   // since there will already be scalar iterations. Note that the minimum
2448   // iterations check ensures that N >= Step.
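  // For example, with TC = 12 and Step = 4 the remainder is 0; when a scalar
  // epilogue is required it is bumped to 4, so the vector loop covers only 8
  // iterations and the remaining 4 run in the scalar loop.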
2449   if (Cost->requiresScalarEpilogue(VF.isVector())) {
2450     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2451     R = Builder.CreateSelect(IsZero, Step, R);
2452   }
2453 
2454   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2455 
2456   return VectorTripCount;
2457 }
2458 
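/// Hook the IR block \p CheckIRBB containing a runtime check into the VPlan:
/// if the predecessor of the vector preheader already branches to the scalar
/// preheader, wrap \p CheckIRBB in a VPIRBasicBlock and insert it on the edge
/// to the vector preheader; then connect the resulting block to the scalar
/// preheader so a failing check can bypass the vector loop.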
2459 void InnerLoopVectorizer::introduceCheckBlockInVPlan(BasicBlock *CheckIRBB) {
2460   VPBlockBase *ScalarPH = Plan.getScalarPreheader();
2461   VPBlockBase *PreVectorPH = VectorPHVPB->getSinglePredecessor();
2462   if (PreVectorPH->getNumSuccessors() != 1) {
2463     assert(PreVectorPH->getNumSuccessors() == 2 && "Expected 2 successors");
2464     assert(PreVectorPH->getSuccessors()[0] == ScalarPH &&
2465            "Unexpected successor");
2466     VPIRBasicBlock *CheckVPIRBB = Plan.createVPIRBasicBlock(CheckIRBB);
2467     VPBlockUtils::insertOnEdge(PreVectorPH, VectorPHVPB, CheckVPIRBB);
2468     PreVectorPH = CheckVPIRBB;
2469   }
2470   VPBlockUtils::connectBlocks(PreVectorPH, ScalarPH);
2471   PreVectorPH->swapSuccessors();
2472 }
2473 
2474 void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
2475   Value *Count = getTripCount();
2476   // Reuse existing vector loop preheader for TC checks.
2477   // Note that new preheader block is generated for vector loop.
2478   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2479   IRBuilder<> Builder(TCCheckBlock->getTerminator());
2480 
2481   // Generate code to check if the loop's trip count is less than VF * UF, or
2482   // equal to it in case a scalar epilogue is required; this implies that the
2483   // vector trip count is zero. This check also covers the case where adding one
2484   // to the backedge-taken count overflowed leading to an incorrect trip count
2485   // of zero. In this case we will also jump to the scalar loop.
2486   auto P = Cost->requiresScalarEpilogue(VF.isVector()) ? ICmpInst::ICMP_ULE
2487                                                        : ICmpInst::ICMP_ULT;
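  // E.g., for VF = 4, UF = 2 and no required scalar epilogue, the check emitted
  // below is roughly "%min.iters.check = icmp ult i64 %count, 8" (with 8
  // replaced by the step CreateStep computes), and the branch created at the
  // end of this function jumps to the scalar loop when the check is true.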
2488 
2489   // If tail is to be folded, vector loop takes care of all iterations.
2490   Type *CountTy = Count->getType();
2491   Value *CheckMinIters = Builder.getFalse();
2492   auto CreateStep = [&]() -> Value * {
2493     // Create step with max(MinProfitableTripCount, UF * VF).
2494     if (UF * VF.getKnownMinValue() >= MinProfitableTripCount.getKnownMinValue())
2495       return createStepForVF(Builder, CountTy, VF, UF);
2496 
2497     Value *MinProfTC =
2498         createStepForVF(Builder, CountTy, MinProfitableTripCount, 1);
2499     if (!VF.isScalable())
2500       return MinProfTC;
2501     return Builder.CreateBinaryIntrinsic(
2502         Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF));
2503   };
2504 
2505   TailFoldingStyle Style = Cost->getTailFoldingStyle();
2506   if (Style == TailFoldingStyle::None) {
2507     Value *Step = CreateStep();
2508     ScalarEvolution &SE = *PSE.getSE();
2509     // TODO: Emit unconditional branch to vector preheader instead of
2510     // conditional branch with known condition.
2511     const SCEV *TripCountSCEV = SE.applyLoopGuards(SE.getSCEV(Count), OrigLoop);
2512     // Check if the trip count is < the step.
2513     if (SE.isKnownPredicate(P, TripCountSCEV, SE.getSCEV(Step))) {
2514       // TODO: Ensure step is at most the trip count when determining max VF and
2515       // UF, w/o tail folding.
2516       CheckMinIters = Builder.getTrue();
2517     } else if (!SE.isKnownPredicate(CmpInst::getInversePredicate(P),
2518                                     TripCountSCEV, SE.getSCEV(Step))) {
2519       // Generate the minimum iteration check only if we cannot prove the
2520       // check is known to be true, or known to be false.
2521       CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
2522     } // else step known to be < trip count, use CheckMinIters preset to false.
2523   } else if (VF.isScalable() &&
2524              !isIndvarOverflowCheckKnownFalse(Cost, VF, UF) &&
2525              Style != TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) {
2526     // vscale is not necessarily a power-of-2, which means we cannot guarantee
2527     // an overflow to zero when updating induction variables and so an
2528     // additional overflow check is required before entering the vector loop.
2529 
2530     // Get the maximum unsigned value for the type.
2531     Value *MaxUIntTripCount =
2532         ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask());
2533     Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count);
2534 
2535     // Don't execute the vector loop if (UMax - n) < (VF * UF).
2536     CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep());
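    // Schematically: "%lhs = sub i64 -1, %count" followed by
    // "%cmp = icmp ult i64 %lhs, <VF * UF (scaled by vscale)>", so the vector
    // loop is bypassed whenever the induction update could wrap.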
2537   }
2538 
2539   // Create new preheader for vector loop.
2540   LoopVectorPreHeader =
2541       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
2542                  "vector.ph");
2543 
2544   assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
2545                                DT->getNode(Bypass)->getIDom()) &&
2546          "TC check is expected to dominate Bypass");
2547 
2548   BranchInst &BI =
2549       *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
2550   if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
2551     setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
2552   ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
2553   LoopBypassBlocks.push_back(TCCheckBlock);
2554 
2555   // TODO: Wrap LoopVectorPreHeader in VPIRBasicBlock here.
2556   introduceCheckBlockInVPlan(TCCheckBlock);
2557 }
2558 
2559 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {
2560   BasicBlock *const SCEVCheckBlock =
2561       RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader);
2562   if (!SCEVCheckBlock)
2563     return nullptr;
2564 
2565   assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
2566            (OptForSizeBasedOnProfile &&
2567             Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
2568          "Cannot SCEV check stride or overflow when optimizing for size");
2569   assert(!LoopBypassBlocks.empty() &&
2570          "Should already be a bypass block due to iteration count check");
2571   LoopBypassBlocks.push_back(SCEVCheckBlock);
2572   AddedSafetyChecks = true;
2573 
2574   introduceCheckBlockInVPlan(SCEVCheckBlock);
2575   return SCEVCheckBlock;
2576 }
2577 
2578 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) {
2579   // VPlan-native path does not do any analysis for runtime checks currently.
2580   if (EnableVPlanNativePath)
2581     return nullptr;
2582 
2583   BasicBlock *const MemCheckBlock =
2584       RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader);
2585 
2586   // Check if we generated code that checks at runtime whether arrays overlap.
2587   // We put the checks into a separate block to make the more common case of
2588   // few elements faster.
2589   if (!MemCheckBlock)
2590     return nullptr;
2591 
2592   if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
2593     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
2594            "Cannot emit memory checks when optimizing for size, unless forced "
2595            "to vectorize.");
2596     ORE->emit([&]() {
2597       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
2598                                         OrigLoop->getStartLoc(),
2599                                         OrigLoop->getHeader())
2600              << "Code-size may be reduced by not forcing "
2601                 "vectorization, or by source-code modifications "
2602                 "eliminating the need for runtime checks "
2603                 "(e.g., adding 'restrict').";
2604     });
2605   }
2606 
2607   LoopBypassBlocks.push_back(MemCheckBlock);
2608 
2609   AddedSafetyChecks = true;
2610 
2611   introduceCheckBlockInVPlan(MemCheckBlock);
2612   return MemCheckBlock;
2613 }
2614 
2615 /// Replace \p VPBB with a VPIRBasicBlock wrapping \p IRBB. All recipes from \p
2616 /// VPBB are moved to the end of the newly created VPIRBasicBlock. VPBB must
2617 /// have a single predecessor, which is rewired to the new VPIRBasicBlock. All
2618 /// successors of VPBB, if any, are rewired to the new VPIRBasicBlock.
2619 static void replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB) {
2620   VPIRBasicBlock *IRVPBB = VPBB->getPlan()->createVPIRBasicBlock(IRBB);
2621   for (auto &R : make_early_inc_range(*VPBB)) {
2622     assert(!R.isPhi() && "Tried to move phi recipe to end of block");
2623     R.moveBefore(*IRVPBB, IRVPBB->end());
2624   }
2625 
2626   VPBlockUtils::reassociateBlocks(VPBB, IRVPBB);
2627   // VPBB is now dead and will be cleaned up when the plan gets destroyed.
2628 }
2629 
2630 void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
2631   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
2632   assert(LoopVectorPreHeader && "Invalid loop structure");
2633   assert((OrigLoop->getUniqueLatchExitBlock() ||
2634           Cost->requiresScalarEpilogue(VF.isVector())) &&
2635          "loops not exiting via the latch without required epilogue?");
2636 
2637   LoopMiddleBlock =
2638       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
2639                  LI, nullptr, Twine(Prefix) + "middle.block");
2640   replaceVPBBWithIRVPBB(Plan.getMiddleBlock(), LoopMiddleBlock);
2641   LoopScalarPreHeader =
2642       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
2643                  nullptr, Twine(Prefix) + "scalar.ph");
2644   replaceVPBBWithIRVPBB(Plan.getScalarPreheader(), LoopScalarPreHeader);
2645 }
2646 
2647 /// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
2648 /// expansion results.
2649 static Value *getExpandedStep(const InductionDescriptor &ID,
2650                               const SCEV2ValueTy &ExpandedSCEVs) {
2651   const SCEV *Step = ID.getStep();
2652   if (auto *C = dyn_cast<SCEVConstant>(Step))
2653     return C->getValue();
2654   if (auto *U = dyn_cast<SCEVUnknown>(Step))
2655     return U->getValue();
2656   auto I = ExpandedSCEVs.find(Step);
2657   assert(I != ExpandedSCEVs.end() && "SCEV must be expanded at this point");
2658   return I->second;
2659 }
2660 
2661 /// Knowing that loop \p L executes a single vector iteration, add instructions
2662 /// that will get simplified and thus should not have any cost to \p
2663 /// InstsToIgnore.
2664 static void addFullyUnrolledInstructionsToIgnore(
2665     Loop *L, const LoopVectorizationLegality::InductionList &IL,
2666     SmallPtrSetImpl<Instruction *> &InstsToIgnore) {
2667   auto *Cmp = L->getLatchCmpInst();
2668   if (Cmp)
2669     InstsToIgnore.insert(Cmp);
2670   for (const auto &KV : IL) {
2671     // Extract the key by hand so that it can be used in the lambda below.  Note
2672     // that captured structured bindings are a C++20 extension.
2673     const PHINode *IV = KV.first;
2674 
2675     // Get next iteration value of the induction variable.
2676     Instruction *IVInst =
2677         cast<Instruction>(IV->getIncomingValueForBlock(L->getLoopLatch()));
2678     if (all_of(IVInst->users(),
2679                [&](const User *U) { return U == IV || U == Cmp; }))
2680       InstsToIgnore.insert(IVInst);
2681   }
2682 }
2683 
2684 void InnerLoopVectorizer::createInductionAdditionalBypassValues(
2685     const SCEV2ValueTy &ExpandedSCEVs, Value *MainVectorTripCount) {
2686   assert(MainVectorTripCount && "Must have bypass information");
2687 
2688   Instruction *OldInduction = Legal->getPrimaryInduction();
2689   IRBuilder<> BypassBuilder(getAdditionalBypassBlock(),
2690                             getAdditionalBypassBlock()->getFirstInsertionPt());
2691   for (const auto &InductionEntry : Legal->getInductionVars()) {
2692     PHINode *OrigPhi = InductionEntry.first;
2693     const InductionDescriptor &II = InductionEntry.second;
2694     Value *Step = getExpandedStep(II, ExpandedSCEVs);
2695     // For the primary induction the additional bypass end value is known.
2696     // Otherwise it is computed.
2697     Value *EndValueFromAdditionalBypass = MainVectorTripCount;
2698     if (OrigPhi != OldInduction) {
2699       auto *BinOp = II.getInductionBinOp();
2700       // Fast-math-flags propagate from the original induction instruction.
2701       if (isa_and_nonnull<FPMathOperator>(BinOp))
2702         BypassBuilder.setFastMathFlags(BinOp->getFastMathFlags());
2703 
2704       // Compute the end value for the additional bypass.
2705       EndValueFromAdditionalBypass =
2706           emitTransformedIndex(BypassBuilder, MainVectorTripCount,
2707                                II.getStartValue(), Step, II.getKind(), BinOp);
2708       EndValueFromAdditionalBypass->setName("ind.end");
2709     }
2710 
2711     // Store the bypass value here, as it needs to be added as operand to its
2712     // scalar preheader phi node after the epilogue skeleton has been created.
2713     // TODO: Directly add as extra operand to the VPResumePHI recipe.
2714     assert(!Induction2AdditionalBypassValue.contains(OrigPhi) &&
2715            "entry for OrigPhi already exists");
2716     Induction2AdditionalBypassValue[OrigPhi] = EndValueFromAdditionalBypass;
2717   }
2718 }
2719 
2720 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton(
2721     const SCEV2ValueTy &ExpandedSCEVs) {
2722   /*
2723    In this function we generate a new loop. The new loop will contain
2724    the vectorized instructions while the old loop will continue to run the
2725    scalar remainder.
2726 
2727        [ ] <-- old preheader - loop iteration number check and SCEVs in Plan's
2728      /  |      preheader are expanded here. Eventually all required SCEV
2729     /   |      expansion should happen here.
2730    /    v
2731   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
2732   |  /  |
2733   | /   v
2734   ||   [ ]     <-- vector pre header.
2735   |/    |
2736   |     v
2737   |    [  ] \
2738   |    [  ]_|   <-- vector loop (created during VPlan execution).
2739   |     |
2740   |     v
2741   \   -[ ]   <--- middle-block (wrapped in VPIRBasicBlock with the branch to
2742    |    |                       successors created during VPlan execution)
2743    \/   |
2744    /\   v
2745    | ->[ ]     <--- new preheader (wrapped in VPIRBasicBlock).
2746    |    |
2747  (opt)  v      <-- edge from middle to exit iff epilogue is not required.
2748    |   [ ] \
2749    |   [ ]_|   <-- old scalar loop to handle remainder (scalar epilogue, header
2750    |    |          wrapped in VPIRBasicBlock).
2751     \   |
2752      \  v
2753       >[ ]     <-- exit block(s). (wrapped in VPIRBasicBlock)
2754    ...
2755    */
2756 
2757   // Create an empty vector loop, and prepare basic blocks for the runtime
2758   // checks.
2759   createVectorLoopSkeleton("");
2760 
2761   // Now, compare the new count to zero. If it is zero skip the vector loop and
2762   // jump to the scalar loop. This check also covers the case where the
2763   // backedge-taken count is uint##_max: adding one to it will overflow leading
2764   // to an incorrect trip count of zero. In this (rare) case we will also jump
2765   // to the scalar loop.
2766   emitIterationCountCheck(LoopScalarPreHeader);
2767 
2768   // Generate the code to check any assumptions that we've made for SCEV
2769   // expressions.
2770   emitSCEVChecks(LoopScalarPreHeader);
2771 
2772   // Generate the code that checks at runtime whether arrays overlap. We put
2773   // the checks into a separate block to make the more common case of few
2774   // elements faster.
2775   emitMemRuntimeChecks(LoopScalarPreHeader);
2776 
2777   return LoopVectorPreHeader;
2778 }
2779 
2780 // Fix up external users of the induction variable. At this point, we are
2781 // in LCSSA form, with all external PHIs that use the IV having one input value,
2782 // coming from the remainder loop. We need those PHIs to also have a correct
2783 // value for the IV when arriving directly from the middle block.
2784 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
2785                                        const InductionDescriptor &II,
2786                                        Value *VectorTripCount,
2787                                        BasicBlock *MiddleBlock,
2788                                        VPTransformState &State) {
2789   // There are two kinds of external IV usages - those that use the value
2790   // computed in the last iteration (the PHI) and those that use the penultimate
2791   // value (the value that feeds into the phi from the loop latch).
2792   // We allow both, but they obviously have different values.
2793 
2794   DenseMap<Value *, Value *> MissingVals;
2795 
2796   Value *EndValue = cast<PHINode>(OrigPhi->getIncomingValueForBlock(
2797                                       OrigLoop->getLoopPreheader()))
2798                         ->getIncomingValueForBlock(MiddleBlock);
2799 
2800   // An external user of the last iteration's value should see the value that
2801   // the remainder loop uses to initialize its own IV.
2802   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
2803   for (User *U : PostInc->users()) {
2804     Instruction *UI = cast<Instruction>(U);
2805     if (!OrigLoop->contains(UI)) {
2806       assert(isa<PHINode>(UI) && "Expected LCSSA form");
2807       MissingVals[UI] = EndValue;
2808     }
2809   }
2810 
2811   // An external user of the penultimate value needs to see EndValue - Step.
2812   // This is equivalent to recomputing it from the constituent SCEVs, that is
2813   // Start + (Step * (CRD - 1)), but reuses the already-computed EndValue.
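  // For instance, for an i64 IV starting at 0 with step 1, EndValue is the
  // vector trip count and the escape value is materialized in the middle block
  // roughly as "%ind.escape = sub i64 %n.vec, 1".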
2814   for (User *U : OrigPhi->users()) {
2815     auto *UI = cast<Instruction>(U);
2816     if (!OrigLoop->contains(UI)) {
2817       assert(isa<PHINode>(UI) && "Expected LCSSA form");
2818       IRBuilder<> B(MiddleBlock->getTerminator());
2819 
2820       // Fast-math-flags propagate from the original induction instruction.
2821       if (isa_and_nonnull<FPMathOperator>(II.getInductionBinOp()))
2822         B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
2823 
2824       VPValue *StepVPV = Plan.getSCEVExpansion(II.getStep());
2825       assert(StepVPV && "step must have been expanded during VPlan execution");
2826       Value *Step = StepVPV->isLiveIn() ? StepVPV->getLiveInIRValue()
2827                                         : State.get(StepVPV, VPLane(0));
2828       Value *Escape = nullptr;
2829       if (EndValue->getType()->isIntegerTy())
2830         Escape = B.CreateSub(EndValue, Step);
2831       else if (EndValue->getType()->isPointerTy())
2832         Escape = B.CreatePtrAdd(EndValue, B.CreateNeg(Step));
2833       else {
2834         assert(EndValue->getType()->isFloatingPointTy() &&
2835                "Unexpected induction type");
2836         Escape = B.CreateBinOp(II.getInductionBinOp()->getOpcode() ==
2837                                        Instruction::FAdd
2838                                    ? Instruction::FSub
2839                                    : Instruction::FAdd,
2840                                EndValue, Step);
2841       }
2842       Escape->setName("ind.escape");
2843       MissingVals[UI] = Escape;
2844     }
2845   }
2846 
2847   assert((MissingVals.empty() ||
2848           all_of(MissingVals,
2849                  [MiddleBlock, this](const std::pair<Value *, Value *> &P) {
2850                    return all_of(
2851                        predecessors(cast<Instruction>(P.first)->getParent()),
2852                        [MiddleBlock, this](BasicBlock *Pred) {
2853                          return Pred == MiddleBlock ||
2854                                 Pred == OrigLoop->getLoopLatch();
2855                        });
2856                  })) &&
2857          "Expected escaping values from latch/middle.block only");
2858 
2859   for (auto &I : MissingVals) {
2860     PHINode *PHI = cast<PHINode>(I.first);
2861     // One corner case we have to handle is two IVs "chasing" each other,
2862     // that is %IV2 = phi [...], [ %IV1, %latch ]
2863     // In this case, if IV1 has an external use, we need to avoid adding both
2864     // "last value of IV1" and "penultimate value of IV2". So, verify that we
2865     // don't already have an incoming value for the middle block.
2866     if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
2867       PHI->addIncoming(I.second, MiddleBlock);
2868   }
2869 }
2870 
2871 namespace {
2872 
2873 struct CSEDenseMapInfo {
2874   static bool canHandle(const Instruction *I) {
2875     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
2876            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
2877   }
2878 
2879   static inline Instruction *getEmptyKey() {
2880     return DenseMapInfo<Instruction *>::getEmptyKey();
2881   }
2882 
2883   static inline Instruction *getTombstoneKey() {
2884     return DenseMapInfo<Instruction *>::getTombstoneKey();
2885   }
2886 
2887   static unsigned getHashValue(const Instruction *I) {
2888     assert(canHandle(I) && "Unknown instruction!");
2889     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
2890                                                            I->value_op_end()));
2891   }
2892 
2893   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
2894     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
2895         LHS == getTombstoneKey() || RHS == getTombstoneKey())
2896       return LHS == RHS;
2897     return LHS->isIdenticalTo(RHS);
2898   }
2899 };
2900 
2901 } // end anonymous namespace
2902 
2903 /// Perform CSE of induction variable instructions.
2904 static void cse(BasicBlock *BB) {
2905   // Perform simple CSE.
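  // For example, two structurally identical getelementptr instructions emitted
  // for different recipes hash and compare equal here, so the second one is
  // replaced by the first and erased.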
2906   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
2907   for (Instruction &In : llvm::make_early_inc_range(*BB)) {
2908     if (!CSEDenseMapInfo::canHandle(&In))
2909       continue;
2910 
2911     // Check if we can replace this instruction with any of the
2912     // visited instructions.
2913     if (Instruction *V = CSEMap.lookup(&In)) {
2914       In.replaceAllUsesWith(V);
2915       In.eraseFromParent();
2916       continue;
2917     }
2918 
2919     CSEMap[&In] = &In;
2920   }
2921 }
2922 
2923 InstructionCost
2924 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
2925                                               ElementCount VF) const {
2926   // We only need to calculate a cost if the VF is scalar; for actual vectors
2927   // we should already have a pre-calculated cost at each VF.
2928   if (!VF.isScalar())
2929     return CallWideningDecisions.at(std::make_pair(CI, VF)).Cost;
2930 
2931   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
2932   Type *RetTy = CI->getType();
2933   if (RecurrenceDescriptor::isFMulAddIntrinsic(CI))
2934     if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind))
2935       return *RedCost;
2936 
2937   SmallVector<Type *, 4> Tys;
2938   for (auto &ArgOp : CI->args())
2939     Tys.push_back(ArgOp->getType());
2940 
2941   InstructionCost ScalarCallCost =
2942       TTI.getCallInstrCost(CI->getCalledFunction(), RetTy, Tys, CostKind);
2943 
2944   // If this is an intrinsic we may have a lower cost for it.
2945   if (getVectorIntrinsicIDForCall(CI, TLI)) {
2946     InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
2947     return std::min(ScalarCallCost, IntrinsicCost);
2948   }
2949   return ScalarCallCost;
2950 }
2951 
2952 static Type *maybeVectorizeType(Type *Elt, ElementCount VF) {
2953   if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
2954     return Elt;
2955   return VectorType::get(Elt, VF);
2956 }
2957 
2958 InstructionCost
2959 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
2960                                                    ElementCount VF) const {
2961   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
2962   assert(ID && "Expected intrinsic call!");
2963   Type *RetTy = maybeVectorizeType(CI->getType(), VF);
2964   FastMathFlags FMF;
2965   if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
2966     FMF = FPMO->getFastMathFlags();
2967 
2968   SmallVector<const Value *> Arguments(CI->args());
2969   FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
2970   SmallVector<Type *> ParamTys;
2971   std::transform(FTy->param_begin(), FTy->param_end(),
2972                  std::back_inserter(ParamTys),
2973                  [&](Type *Ty) { return maybeVectorizeType(Ty, VF); });
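  // E.g., for "call float @llvm.pow.f32(float %x, float %y)" at VF = 4, both
  // the return type and the parameter types are widened to <4 x float> before
  // querying TTI for the intrinsic's cost.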
2974 
2975   IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
2976                                     dyn_cast<IntrinsicInst>(CI));
2977   return TTI.getIntrinsicInstrCost(CostAttrs,
2978                                    TargetTransformInfo::TCK_RecipThroughput);
2979 }
2980 
2981 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
2982   // Fix widened non-induction PHIs by setting up the PHI operands.
2983   if (EnableVPlanNativePath)
2984     fixNonInductionPHIs(State);
2985 
2986   // Forget the original basic block.
2987   PSE.getSE()->forgetLoop(OrigLoop);
2988   PSE.getSE()->forgetBlockAndLoopDispositions();
2989 
2990   // After vectorization, the exit blocks of the original loop will have
2991   // additional predecessors. Invalidate SCEVs for the exit phis in case SE
2992   // looked through single-entry phis.
2993   SmallVector<BasicBlock *> ExitBlocks;
2994   OrigLoop->getExitBlocks(ExitBlocks);
2995   for (BasicBlock *Exit : ExitBlocks)
2996     for (PHINode &PN : Exit->phis())
2997       PSE.getSE()->forgetLcssaPhiWithNewPredecessor(OrigLoop, &PN);
2998 
2999   if (Cost->requiresScalarEpilogue(VF.isVector())) {
3000     // No edge from the middle block to the unique exit block has been inserted
3001     // and there is nothing to fix from vector loop; phis should have incoming
3002     // from scalar loop only.
3003   } else {
3004     // TODO: Check in VPlan to see if IV users need fixing instead of checking
3005     // the cost model.
3006 
3007     // If we inserted an edge from the middle block to the unique exit block,
3008     // update uses outside the loop (phis) to account for the newly inserted
3009     // edge.
3010 
3011     // Fix-up external users of the induction variables.
3012     for (const auto &Entry : Legal->getInductionVars())
3013       fixupIVUsers(Entry.first, Entry.second,
3014                    getOrCreateVectorTripCount(nullptr), LoopMiddleBlock, State);
3015   }
3016 
3017   // Don't apply optimizations below when no vector region remains, as they all
3018   // require a vector loop at the moment.
3019   if (!State.Plan->getVectorLoopRegion())
3020     return;
3021 
3022   for (Instruction *PI : PredicatedInstructions)
3023     sinkScalarOperands(&*PI);
3024 
3025   VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion();
3026   VPBasicBlock *HeaderVPBB = VectorRegion->getEntryBasicBlock();
3027   BasicBlock *HeaderBB = State.CFG.VPBB2IRBB[HeaderVPBB];
3028 
3029   // Remove redundant induction instructions.
3030   cse(HeaderBB);
3031 
3032   // Set/update profile weights for the vector and remainder loops as original
3033   // loop iterations are now distributed among them. Note that original loop
3034   // becomes the scalar remainder loop after vectorization.
3035   //
3036   // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
3037   // end up with a slightly roughened result, but that should be OK since the
3038   // profile is not inherently precise anyway. Note also that a possible bypass
3039   // of the vector code caused by legality checks is ignored, optimistically
3040   // assigning all the weight to the vector loop.
3041   //
3042   // For scalable vectorization we can't know at compile time how many
3043   // iterations of the loop are handled in one vector iteration, so instead
3044   // assume a pessimistic vscale of '1'.
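  // As a rough illustration: with a profile implying ~1024 original iterations
  // and VF * UF = 8, the vector loop's weights are adjusted to reflect roughly
  // 1024 / 8 = 128 iterations.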
3045   Loop *VectorLoop = LI->getLoopFor(HeaderBB);
3046   setProfileInfoAfterUnrolling(OrigLoop, VectorLoop, OrigLoop,
3047                                VF.getKnownMinValue() * UF);
3048 }
3049 
3050 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
3051   // The basic block and loop containing the predicated instruction.
3052   auto *PredBB = PredInst->getParent();
3053   auto *VectorLoop = LI->getLoopFor(PredBB);
3054 
3055   // Initialize a worklist with the operands of the predicated instruction.
3056   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3057 
3058   // Holds instructions that we need to analyze again. An instruction may be
3059   // reanalyzed if we don't yet know if we can sink it or not.
3060   SmallVector<Instruction *, 8> InstsToReanalyze;
3061 
3062   // Returns true if a given use occurs in the predicated block. Phi nodes use
3063   // their operands in their corresponding predecessor blocks.
3064   auto IsBlockOfUsePredicated = [&](Use &U) -> bool {
3065     auto *I = cast<Instruction>(U.getUser());
3066     BasicBlock *BB = I->getParent();
3067     if (auto *Phi = dyn_cast<PHINode>(I))
3068       BB = Phi->getIncomingBlock(
3069           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3070     return BB == PredBB;
3071   };
3072 
3073   // Iteratively sink the scalarized operands of the predicated instruction
3074   // into the block we created for it. When an instruction is sunk, its
3075   // operands are then added to the worklist. The algorithm ends after one pass
3076   // through the worklist doesn't sink a single instruction.
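  // For example, the address computation feeding only a scalarized predicated
  // store can be sunk into the store's block, which may in turn allow sinking
  // the instructions that feed the address computation.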
3077   bool Changed;
3078   do {
3079     // Add the instructions that need to be reanalyzed to the worklist, and
3080     // reset the changed indicator.
3081     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
3082     InstsToReanalyze.clear();
3083     Changed = false;
3084 
3085     while (!Worklist.empty()) {
3086       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
3087 
3088       // We can't sink an instruction if it is a phi node, is not in the loop,
3089       // may have side effects or may read from memory.
3090       // TODO: Could do more granular checking to allow sinking
3091       // a load past non-store instructions.
3092       if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
3093           I->mayHaveSideEffects() || I->mayReadFromMemory())
3094         continue;
3095 
3096       // If the instruction is already in PredBB, check if we can sink its
3097       // operands. In that case, VPlan's sinkScalarOperands() succeeded in
3098       // sinking the scalar instruction I, hence it appears in PredBB; but it
3099       // may have failed to sink I's operands (recursively), which we try
3100       // (again) here.
3101       if (I->getParent() == PredBB) {
3102         Worklist.insert(I->op_begin(), I->op_end());
3103         continue;
3104       }
3105 
3106       // It's legal to sink the instruction if all its uses occur in the
3107       // predicated block. Otherwise, there's nothing to do yet, and we may
3108       // need to reanalyze the instruction.
3109       if (!llvm::all_of(I->uses(), IsBlockOfUsePredicated)) {
3110         InstsToReanalyze.push_back(I);
3111         continue;
3112       }
3113 
3114       // Move the instruction to the beginning of the predicated block, and add
3115       // its operands to the worklist.
3116       I->moveBefore(&*PredBB->getFirstInsertionPt());
3117       Worklist.insert(I->op_begin(), I->op_end());
3118 
3119       // The sinking may have enabled other instructions to be sunk, so we will
3120       // need to iterate.
3121       Changed = true;
3122     }
3123   } while (Changed);
3124 }
3125 
3126 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) {
3127   auto Iter = vp_depth_first_deep(Plan.getEntry());
3128   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
3129     for (VPRecipeBase &P : VPBB->phis()) {
3130       VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P);
3131       if (!VPPhi)
3132         continue;
3133       PHINode *NewPhi = cast<PHINode>(State.get(VPPhi));
3134       // Make sure the builder has a valid insert point.
3135       Builder.SetInsertPoint(NewPhi);
3136       for (unsigned Idx = 0; Idx < VPPhi->getNumOperands(); ++Idx) {
3137         VPValue *Inc = VPPhi->getIncomingValue(Idx);
3138         VPBasicBlock *VPBB = VPPhi->getIncomingBlock(Idx);
3139         NewPhi->addIncoming(State.get(Inc), State.CFG.VPBB2IRBB[VPBB]);
3140       }
3141     }
3142   }
3143 }
3144 
3145 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
3146   // We should not collect Scalars more than once per VF. Right now, this
3147   // function is called from collectUniformsAndScalars(), which already does
3148   // this check. Collecting Scalars for VF=1 does not make any sense.
3149   assert(VF.isVector() && !Scalars.contains(VF) &&
3150          "This function should not be visited twice for the same VF");
3151 
3152   // This avoids any chances of creating a REPLICATE recipe during planning
3153   // since that would result in generation of scalarized code during execution,
3154   // which is not supported for scalable vectors.
3155   if (VF.isScalable()) {
3156     Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end());
3157     return;
3158   }
3159 
3160   SmallSetVector<Instruction *, 8> Worklist;
3161 
3162   // These sets are used to seed the analysis with pointers used by memory
3163   // accesses that will remain scalar.
3164   SmallSetVector<Instruction *, 8> ScalarPtrs;
3165   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
3166   auto *Latch = TheLoop->getLoopLatch();
3167 
3168   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
3169   // The pointer operands of loads and stores will be scalar as long as the
3170   // memory access is not a gather or scatter operation. The value operand of a
3171   // store will remain scalar if the store is scalarized.
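  // E.g., the pointer operand of a consecutive (CM_Widen) load is a scalar use,
  // whereas the pointer of a CM_GatherScatter access is not; a store's value
  // operand is a scalar use only if the store itself is CM_Scalarize.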
3172   auto IsScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
3173     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
3174     assert(WideningDecision != CM_Unknown &&
3175            "Widening decision should be ready at this moment");
3176     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
3177       if (Ptr == Store->getValueOperand())
3178         return WideningDecision == CM_Scalarize;
3179     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
3180            "Ptr is neither a value nor a pointer operand");
3181     return WideningDecision != CM_GatherScatter;
3182   };
3183 
3184   // A helper that returns true if the given value is a getelementptr
3185   // instruction contained in the loop.
3186   auto IsLoopVaryingGEP = [&](Value *V) {
3187     return isa<GetElementPtrInst>(V) && !TheLoop->isLoopInvariant(V);
3188   };
3189 
3190   // A helper that evaluates a memory access's use of a pointer. If the use will
3191   // be a scalar use and the pointer is only used by memory accesses, we place
3192   // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
3193   // PossibleNonScalarPtrs.
3194   auto EvaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
3195     // We only care about getelementptr instructions that are contained in the
3196     // loop and are not loop-invariant.
3197     if (!IsLoopVaryingGEP(Ptr))
3198       return;
3199 
3200     // If the pointer has already been identified as scalar (e.g., if it was
3201     // also identified as uniform), there's nothing to do.
3202     auto *I = cast<Instruction>(Ptr);
3203     if (Worklist.count(I))
3204       return;
3205 
3206     // If the use of the pointer will be a scalar use, and all users of the
3207     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
3208     // place the pointer in PossibleNonScalarPtrs.
3209     if (IsScalarUse(MemAccess, Ptr) &&
3210         all_of(I->users(), IsaPred<LoadInst, StoreInst>))
3211       ScalarPtrs.insert(I);
3212     else
3213       PossibleNonScalarPtrs.insert(I);
3214   };
3215 
3216   // We seed the scalars analysis with three classes of instructions: (1)
3217   // instructions marked uniform-after-vectorization, (2) bitcast,
3218   // getelementptr and (pointer) phi instructions used by memory accesses
3219   // requiring a scalar use, and (3) instructions forced to be scalar.
3220   //
3221   // (1) Add to the worklist all instructions that have been identified as
3222   // uniform-after-vectorization.
3223   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
3224 
3225   // (2) Add to the worklist all bitcast and getelementptr instructions used by
3226   // memory accesses requiring a scalar use. The pointer operands of loads and
3227   // stores will be scalar unless the operation is a gather or scatter.
3228   // The value operand of a store will remain scalar if the store is scalarized.
3229   for (auto *BB : TheLoop->blocks())
3230     for (auto &I : *BB) {
3231       if (auto *Load = dyn_cast<LoadInst>(&I)) {
3232         EvaluatePtrUse(Load, Load->getPointerOperand());
3233       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
3234         EvaluatePtrUse(Store, Store->getPointerOperand());
3235         EvaluatePtrUse(Store, Store->getValueOperand());
3236       }
3237     }
3238   for (auto *I : ScalarPtrs)
3239     if (!PossibleNonScalarPtrs.count(I)) {
3240       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
3241       Worklist.insert(I);
3242     }
3243 
3244   // Insert the forced scalars.
3245   // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector
3246   // induction variable when the PHI user is scalarized.
3247   auto ForcedScalar = ForcedScalars.find(VF);
3248   if (ForcedScalar != ForcedScalars.end())
3249     for (auto *I : ForcedScalar->second) {
3250       LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n");
3251       Worklist.insert(I);
3252     }
3253 
3254   // Expand the worklist by looking through any bitcasts and getelementptr
3255   // instructions we've already identified as scalar. This is similar to the
3256   // expansion step in collectLoopUniforms(); however, here we're only
3257   // expanding to include additional bitcasts and getelementptr instructions.
3258   unsigned Idx = 0;
3259   while (Idx != Worklist.size()) {
3260     Instruction *Dst = Worklist[Idx++];
3261     if (!IsLoopVaryingGEP(Dst->getOperand(0)))
3262       continue;
3263     auto *Src = cast<Instruction>(Dst->getOperand(0));
3264     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
3265           auto *J = cast<Instruction>(U);
3266           return !TheLoop->contains(J) || Worklist.count(J) ||
3267                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
3268                   IsScalarUse(J, Src));
3269         })) {
3270       Worklist.insert(Src);
3271       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
3272     }
3273   }
3274 
3275   // An induction variable will remain scalar if all users of the induction
3276   // variable and induction variable update remain scalar.
3277   for (const auto &Induction : Legal->getInductionVars()) {
3278     auto *Ind = Induction.first;
3279     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
3280 
3281     // If tail-folding is applied, the primary induction variable will be used
3282     // to feed a vector compare.
3283     if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
3284       continue;
3285 
3286     // Returns true if \p Indvar is a pointer induction that is used directly by
3287     // load/store instruction \p I.
3288     auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
3289                                               Instruction *I) {
3290       return Induction.second.getKind() ==
3291                  InductionDescriptor::IK_PtrInduction &&
3292              (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
3293              Indvar == getLoadStorePointerOperand(I) && IsScalarUse(I, Indvar);
3294     };
3295 
3296     // Determine if all users of the induction variable are scalar after
3297     // vectorization.
3298     bool ScalarInd = all_of(Ind->users(), [&](User *U) -> bool {
3299       auto *I = cast<Instruction>(U);
3300       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
3301              IsDirectLoadStoreFromPtrIndvar(Ind, I);
3302     });
3303     if (!ScalarInd)
3304       continue;
3305 
3306     // If the induction variable update is a fixed-order recurrence, neither the
3307     // induction variable nor its update should be marked scalar after
3308     // vectorization.
3309     auto *IndUpdatePhi = dyn_cast<PHINode>(IndUpdate);
3310     if (IndUpdatePhi && Legal->isFixedOrderRecurrence(IndUpdatePhi))
3311       continue;
3312 
3313     // Determine if all users of the induction variable update instruction are
3314     // scalar after vectorization.
3315     bool ScalarIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool {
3316       auto *I = cast<Instruction>(U);
3317       return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
3318              IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
3319     });
3320     if (!ScalarIndUpdate)
3321       continue;
3322 
3323     // The induction variable and its update instruction will remain scalar.
3324     Worklist.insert(Ind);
3325     Worklist.insert(IndUpdate);
3326     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
3327     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
3328                       << "\n");
3329   }
3330 
3331   Scalars[VF].insert(Worklist.begin(), Worklist.end());
3332 }
3333 
3334 bool LoopVectorizationCostModel::isScalarWithPredication(
3335     Instruction *I, ElementCount VF) const {
3336   if (!isPredicatedInst(I))
3337     return false;
3338 
3339   // Do we have a non-scalar lowering for this predicated
3340   // instruction? No - it is scalar with predication.
3341   switch (I->getOpcode()) {
3342   default:
3343     return true;
3344   case Instruction::Call:
3345     if (VF.isScalar())
3346       return true;
3347     return CallWideningDecisions.at(std::make_pair(cast<CallInst>(I), VF))
3348                .Kind == CM_Scalarize;
3349   case Instruction::Load:
3350   case Instruction::Store: {
3351     auto *Ptr = getLoadStorePointerOperand(I);
3352     auto *Ty = getLoadStoreType(I);
3353     Type *VTy = Ty;
3354     if (VF.isVector())
3355       VTy = VectorType::get(Ty, VF);
3356     const Align Alignment = getLoadStoreAlignment(I);
3357     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
3358                                 TTI.isLegalMaskedGather(VTy, Alignment))
3359                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
3360                                 TTI.isLegalMaskedScatter(VTy, Alignment));
3361   }
3362   case Instruction::UDiv:
3363   case Instruction::SDiv:
3364   case Instruction::SRem:
3365   case Instruction::URem: {
3366     // We have the option to use the safe-divisor idiom to avoid predication.
3367     // The cost-based decision here will always select safe-divisor for
3368     // scalable vectors as scalarization isn't legal.
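    // The safe-divisor idiom keeps the divide unpredicated by substituting a
    // harmless divisor in masked-off lanes, e.g. roughly
    //   %safe.div = select <4 x i1> %mask, <4 x i32> %d, <4 x i32> splat (i32 1)
    // before the vector udiv/sdiv/urem/srem.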
3369     const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
3370     return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost);
3371   }
3372   }
3373 }
3374 
3375 // TODO: Fold into LoopVectorizationLegality::isMaskRequired.
3376 bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
3377   // If predication is not needed, avoid it.
3378   // TODO: We can use the loop-preheader as context point here and get
3379   // context sensitive reasoning for isSafeToSpeculativelyExecute.
3380   if (!blockNeedsPredicationForAnyReason(I->getParent()) ||
3381       isSafeToSpeculativelyExecute(I) ||
3382       (isa<LoadInst, StoreInst, CallInst>(I) && !Legal->isMaskRequired(I)) ||
3383       isa<BranchInst, SwitchInst, PHINode, AllocaInst>(I))
3384     return false;
3385 
3386   // If the instruction was executed conditionally in the original scalar loop,
3387   // predication is needed with a mask whose lanes are all possibly inactive.
3388   if (Legal->blockNeedsPredication(I->getParent()))
3389     return true;
3390 
3391   // All that remain are instructions with side-effects originally executed in
3392   // the loop unconditionally, but now execute under a tail-fold mask (only)
3393   // having at least one active lane (the first). If the side-effects of the
3394   // instruction are invariant, executing it w/o (the tail-folding) mask is safe
3395   // - it will cause the same side-effects as when masked.
3396   switch (I->getOpcode()) {
3397   default:
3398     llvm_unreachable(
3399         "instruction should have been considered by earlier checks");
3400   case Instruction::Call:
3401     // Side-effects of a Call are assumed to be non-invariant, needing a
3402     // (fold-tail) mask.
3403     assert(Legal->isMaskRequired(I) &&
3404            "should have returned earlier for calls not needing a mask");
3405     return true;
3406   case Instruction::Load:
3407     // If the address is loop invariant no predication is needed.
3408     return !Legal->isInvariant(getLoadStorePointerOperand(I));
3409   case Instruction::Store: {
3410     // For stores, we need to prove both speculation safety (which follows from
3411     // the same argument as loads), but also must prove the value being stored
3412     // is correct.  The easiest form of the latter is to require that all values
3413     // stored are the same.
3414     return !(Legal->isInvariant(getLoadStorePointerOperand(I)) &&
3415              TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()));
3416   }
3417   case Instruction::UDiv:
3418   case Instruction::SDiv:
3419   case Instruction::SRem:
3420   case Instruction::URem:
3421     // If the divisor is loop-invariant no predication is needed.
3422     return !TheLoop->isLoopInvariant(I->getOperand(1));
3423   }
3424 }
3425 
3426 std::pair<InstructionCost, InstructionCost>
3427 LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
3428                                                     ElementCount VF) const {
3429   assert(I->getOpcode() == Instruction::UDiv ||
3430          I->getOpcode() == Instruction::SDiv ||
3431          I->getOpcode() == Instruction::SRem ||
3432          I->getOpcode() == Instruction::URem);
3433   assert(!isSafeToSpeculativelyExecute(I));
3434 
3435   const TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
3436 
3437   // Scalarization isn't legal for scalable vector types
3438   InstructionCost ScalarizationCost = InstructionCost::getInvalid();
3439   if (!VF.isScalable()) {
3440     // Get the scalarization cost and scale this amount by the probability of
3441     // executing the predicated block. If the instruction is not predicated,
3442     // we fall through to the next case.
3443     ScalarizationCost = 0;
3444 
3445     // These instructions have a non-void type, so account for the phi nodes
3446     // that we will create. This cost is likely to be zero. The phi node
3447     // cost, if any, should be scaled by the block probability because it
3448     // models a copy at the end of each predicated block.
3449     ScalarizationCost += VF.getKnownMinValue() *
3450       TTI.getCFInstrCost(Instruction::PHI, CostKind);
3451 
3452     // The cost of the non-predicated instruction.
3453     ScalarizationCost += VF.getKnownMinValue() *
3454       TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind);
3455 
3456     // The cost of insertelement and extractelement instructions needed for
3457     // scalarization.
3458     ScalarizationCost += getScalarizationOverhead(I, VF, CostKind);
3459 
3460     // Scale the cost by the probability of executing the predicated blocks.
3461     // This assumes the predicated block for each vector lane is equally
3462     // likely.
3463     ScalarizationCost = ScalarizationCost / getReciprocalPredBlockProb();
3464   }
3465   InstructionCost SafeDivisorCost = 0;
3466 
3467   auto *VecTy = toVectorTy(I->getType(), VF);
3468 
3469   // The cost of the select guard to ensure all lanes are well defined
3470   // after we speculate above any internal control flow.
3471   SafeDivisorCost +=
3472       TTI.getCmpSelInstrCost(Instruction::Select, VecTy,
3473                              toVectorTy(Type::getInt1Ty(I->getContext()), VF),
3474                              CmpInst::BAD_ICMP_PREDICATE, CostKind);
3475 
3476   // Certain instructions can be cheaper to vectorize if they have a constant
3477   // second vector operand. One example of this are shifts on x86.
3478   Value *Op2 = I->getOperand(1);
3479   auto Op2Info = TTI.getOperandInfo(Op2);
3480   if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
3481       Legal->isInvariant(Op2))
3482     Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
3483 
3484   SmallVector<const Value *, 4> Operands(I->operand_values());
3485   SafeDivisorCost += TTI.getArithmeticInstrCost(
3486     I->getOpcode(), VecTy, CostKind,
3487     {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
3488     Op2Info, Operands, I);
3489   return {ScalarizationCost, SafeDivisorCost};
3490 }
3491 
3492 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
3493     Instruction *I, ElementCount VF) const {
3494   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
3495   assert(getWideningDecision(I, VF) == CM_Unknown &&
3496          "Decision should not be set yet.");
3497   auto *Group = getInterleavedAccessGroup(I);
3498   assert(Group && "Must have a group.");
3499   unsigned InterleaveFactor = Group->getFactor();
3500 
3501   // If the instruction's allocated size doesn't equal its type size, it
3502   // requires padding and will be scalarized.
3503   auto &DL = I->getDataLayout();
3504   auto *ScalarTy = getLoadStoreType(I);
3505   if (hasIrregularType(ScalarTy, DL))
3506     return false;
3507 
3508   // We currently only know how to emit interleave/deinterleave with
3509   // Factor=2 for scalable vectors. This is purely an implementation
3510   // limit.
3511   if (VF.isScalable() && InterleaveFactor != 2)
3512     return false;
3513 
3514   // If the group involves a non-integral pointer, we may not be able to
3515   // losslessly cast all values to a common type.
3516   bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy);
3517   for (unsigned Idx = 0; Idx < InterleaveFactor; Idx++) {
3518     Instruction *Member = Group->getMember(Idx);
3519     if (!Member)
3520       continue;
3521     auto *MemberTy = getLoadStoreType(Member);
3522     bool MemberNI = DL.isNonIntegralPointerType(MemberTy);
3523     // Don't coerce non-integral pointers to integers or vice versa.
3524     if (MemberNI != ScalarNI)
3525       // TODO: Consider adding special nullptr value case here
3526       return false;
3527     if (MemberNI && ScalarNI &&
3528         ScalarTy->getPointerAddressSpace() !=
3529             MemberTy->getPointerAddressSpace())
3530       return false;
3531   }
3532 
3533   // Check if masking is required.
3534   // A Group may need masking for one of two reasons: it resides in a block that
3535   // needs predication, or it was decided to use masking to deal with gaps
3536   // (either a gap at the end of a load-access that may result in a speculative
3537   // load, or any gaps in a store-access).
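  // For example, a store group with factor 3 but only 2 members has a gap, and
  // masking keeps the missing member's lanes from being written; a load group
  // with a trailing gap may need masking to avoid reading past the last element
  // when no scalar epilogue is allowed.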
3538   bool PredicatedAccessRequiresMasking =
3539       blockNeedsPredicationForAnyReason(I->getParent()) &&
3540       Legal->isMaskRequired(I);
3541   bool LoadAccessWithGapsRequiresEpilogMasking =
3542       isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
3543       !isScalarEpilogueAllowed();
3544   bool StoreAccessWithGapsRequiresMasking =
3545       isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
3546   if (!PredicatedAccessRequiresMasking &&
3547       !LoadAccessWithGapsRequiresEpilogMasking &&
3548       !StoreAccessWithGapsRequiresMasking)
3549     return true;
3550 
3551   // If masked interleaving is required, we expect that the user/target had
3552   // enabled it, because otherwise it either wouldn't have been created or
3553   // it should have been invalidated by the CostModel.
3554   assert(useMaskedInterleavedAccesses(TTI) &&
3555          "Masked interleave-groups for predicated accesses are not enabled.");
3556 
3557   if (Group->isReverse())
3558     return false;
3559 
3560   auto *Ty = getLoadStoreType(I);
3561   const Align Alignment = getLoadStoreAlignment(I);
3562   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
3563                           : TTI.isLegalMaskedStore(Ty, Alignment);
3564 }
3565 
3566 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
3567     Instruction *I, ElementCount VF) {
3568   // Get and ensure we have a valid memory instruction.
3569   assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
3570 
3571   auto *Ptr = getLoadStorePointerOperand(I);
3572   auto *ScalarTy = getLoadStoreType(I);
3573 
3574   // In order to be widened, the pointer should be consecutive, first of all.
3575   if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
3576     return false;
3577 
3578   // If the instruction is a store located in a predicated block, it will be
3579   // scalarized.
3580   if (isScalarWithPredication(I, VF))
3581     return false;
3582 
3583   // If the instruction's allocated size doesn't equal its type size, it
3584   // requires padding and will be scalarized.
3585   auto &DL = I->getDataLayout();
3586   if (hasIrregularType(ScalarTy, DL))
3587     return false;
3588 
3589   return true;
3590 }
3591 
3592 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
3593   // We should not collect Uniforms more than once per VF. Right now,
3594   // this function is called from collectUniformsAndScalars(), which
3595   // already does this check. Collecting Uniforms for VF=1 does not make any
3596   // sense.
3597 
3598   assert(VF.isVector() && !Uniforms.contains(VF) &&
3599          "This function should not be visited twice for the same VF");
3600 
3601   // Initialize the entry for this VF. Even if no uniform values are found,
3602   // the (empty) entry makes Uniforms.count(VF) return 1, so we won't re-analyze.
3603   Uniforms[VF].clear();
3604 
3605   // Now we know that the loop is vectorizable!
3606   // Collect instructions inside the loop that will remain uniform after
3607   // vectorization.
3608 
3609   // Global values, params and instructions outside of current loop are out of
3610   // scope.
3611   auto IsOutOfScope = [&](Value *V) -> bool {
3612     Instruction *I = dyn_cast<Instruction>(V);
3613     return (!I || !TheLoop->contains(I));
3614   };
3615 
3616   // Worklist containing uniform instructions demanding lane 0.
3617   SetVector<Instruction *> Worklist;
3618 
3619   // Add uniform instructions demanding lane 0 to the worklist. Instructions
3620   // that require predication must not be considered uniform after
3621   // vectorization, because that would create an erroneous replicating region
3622   // where only a single instance out of VF should be formed.
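       // For instance (sketch only): a potentially trapping udiv sitting in a
       // conditional block, e.g.
       //   if (c[i]) t = x / y;   // x, y loop-invariant
       // still requires a per-lane predicate and is therefore not added here.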
3623   auto AddToWorklistIfAllowed = [&](Instruction *I) -> void {
3624     if (IsOutOfScope(I)) {
3625       LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
3626                         << *I << "\n");
3627       return;
3628     }
3629     if (isPredicatedInst(I)) {
3630       LLVM_DEBUG(
3631           dbgs() << "LV: Found not uniform due to requiring predication: " << *I
3632                  << "\n");
3633       return;
3634     }
3635     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
3636     Worklist.insert(I);
3637   };
3638 
3639   // Start with the conditional branches exiting the loop. If the branch
3640   // condition is an instruction contained in the loop that is only used by the
3641   // branch, it is uniform. Note that conditions from uncountable early exits
3642   // are not uniform.
3643   SmallVector<BasicBlock *> Exiting;
3644   TheLoop->getExitingBlocks(Exiting);
3645   for (BasicBlock *E : Exiting) {
3646     if (Legal->hasUncountableEarlyExit() && TheLoop->getLoopLatch() != E)
3647       continue;
3648     auto *Cmp = dyn_cast<Instruction>(E->getTerminator()->getOperand(0));
3649     if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
3650       AddToWorklistIfAllowed(Cmp);
3651   }
3652 
3653   auto PrevVF = VF.divideCoefficientBy(2);
3654   // Return true if all lanes perform the same memory operation, and we can
3655   // thus choose to execute only one.
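       // E.g. (sketch): in
       //   for (i = 0; i < n; i++) tmp = *p;   // p loop-invariant
       // every lane loads the same address, so one scalar load suffices; a store
       // is only treated this way if the stored value is loop-invariant as well.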
3656   auto IsUniformMemOpUse = [&](Instruction *I) {
3657     // If the value was already known to not be uniform for the previous
3658     // (smaller VF), it cannot be uniform for the larger VF.
3659     if (PrevVF.isVector()) {
3660       auto Iter = Uniforms.find(PrevVF);
3661       if (Iter != Uniforms.end() && !Iter->second.contains(I))
3662         return false;
3663     }
3664     if (!Legal->isUniformMemOp(*I, VF))
3665       return false;
3666     if (isa<LoadInst>(I))
3667       // Loading the same address always produces the same result - at least
3668       // assuming aliasing and ordering which have already been checked.
3669       return true;
3670     // Storing the same value on every iteration.
3671     return TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand());
3672   };
3673 
3674   auto IsUniformDecision = [&](Instruction *I, ElementCount VF) {
3675     InstWidening WideningDecision = getWideningDecision(I, VF);
3676     assert(WideningDecision != CM_Unknown &&
3677            "Widening decision should be ready at this moment");
3678 
3679     if (IsUniformMemOpUse(I))
3680       return true;
3681 
3682     return (WideningDecision == CM_Widen ||
3683             WideningDecision == CM_Widen_Reverse ||
3684             WideningDecision == CM_Interleave);
3685   };
3686 
3687   // Returns true if Ptr is the pointer operand of a memory access instruction
3688   // I, I is known to not require scalarization, and the pointer is not also
3689   // stored.
3690   auto IsVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
3691     if (isa<StoreInst>(I) && I->getOperand(0) == Ptr)
3692       return false;
3693     return getLoadStorePointerOperand(I) == Ptr &&
3694            (IsUniformDecision(I, VF) || Legal->isInvariant(Ptr));
3695   };
3696 
3697   // Holds a list of values which are known to have at least one uniform use.
3698   // Note that there may be other uses which aren't uniform.  A "uniform use"
3699   // here is something which only demands lane 0 of the unrolled iterations;
3700   // it does not imply that all lanes produce the same value (e.g. this is not
3701   // the usual meaning of uniform)
3702   SetVector<Value *> HasUniformUse;
3703 
3704   // Scan the loop for instructions which either a) are known to have only
3705   // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
3706   for (auto *BB : TheLoop->blocks())
3707     for (auto &I : *BB) {
3708       if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
3709         switch (II->getIntrinsicID()) {
3710         case Intrinsic::sideeffect:
3711         case Intrinsic::experimental_noalias_scope_decl:
3712         case Intrinsic::assume:
3713         case Intrinsic::lifetime_start:
3714         case Intrinsic::lifetime_end:
3715           if (TheLoop->hasLoopInvariantOperands(&I))
3716             AddToWorklistIfAllowed(&I);
3717           break;
3718         default:
3719           break;
3720         }
3721       }
3722 
3723       // ExtractValue instructions must be uniform, because the operands are
3724       // known to be loop-invariant.
3725       if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
3726         assert(IsOutOfScope(EVI->getAggregateOperand()) &&
3727                "Expected aggregate value to be loop invariant");
3728         AddToWorklistIfAllowed(EVI);
3729         continue;
3730       }
3731 
3732       // If there's no pointer operand, there's nothing to do.
3733       auto *Ptr = getLoadStorePointerOperand(&I);
3734       if (!Ptr)
3735         continue;
3736 
3737       if (IsUniformMemOpUse(&I))
3738         AddToWorklistIfAllowed(&I);
3739 
3740       if (IsVectorizedMemAccessUse(&I, Ptr))
3741         HasUniformUse.insert(Ptr);
3742     }
3743 
3744   // Add to the worklist any operands which have *only* uniform (e.g. lane 0
3745   // demanding) users.  Since loops are assumed to be in LCSSA form, this
3746   // disallows uses outside the loop as well.
3747   for (auto *V : HasUniformUse) {
3748     if (IsOutOfScope(V))
3749       continue;
3750     auto *I = cast<Instruction>(V);
3751     bool UsersAreMemAccesses = all_of(I->users(), [&](User *U) -> bool {
3752       auto *UI = cast<Instruction>(U);
3753       return TheLoop->contains(UI) && IsVectorizedMemAccessUse(UI, V);
3754     });
3755     if (UsersAreMemAccesses)
3756       AddToWorklistIfAllowed(I);
3757   }
3758 
3759   // Expand Worklist in topological order: whenever a new instruction
3760   // is added, its users should already be inside Worklist. This ensures that
3761   // a uniform instruction will only be used by uniform instructions.
3762   unsigned Idx = 0;
3763   while (Idx != Worklist.size()) {
3764     Instruction *I = Worklist[Idx++];
3765 
3766     for (auto *OV : I->operand_values()) {
3767       // Out-of-scope operands cannot be uniform instructions.
3768       if (IsOutOfScope(OV))
3769         continue;
3770       // First order recurrence Phi's should typically be considered
3771       // non-uniform.
3772       auto *OP = dyn_cast<PHINode>(OV);
3773       if (OP && Legal->isFixedOrderRecurrence(OP))
3774         continue;
3775       // If all the users of the operand are uniform, then add the
3776       // operand into the uniform worklist.
3777       auto *OI = cast<Instruction>(OV);
3778       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
3779             auto *J = cast<Instruction>(U);
3780             return Worklist.count(J) || IsVectorizedMemAccessUse(J, OI);
3781           }))
3782         AddToWorklistIfAllowed(OI);
3783     }
3784   }
3785 
3786   // For an instruction to be added into Worklist above, all its users inside
3787   // the loop should also be in Worklist. However, this condition cannot be
3788   // true for phi nodes that form a cyclic dependence. We must process phi
3789   // nodes separately. An induction variable will remain uniform if all users
3790   // of the induction variable and induction variable update remain uniform.
3791   // The code below handles both pointer and non-pointer induction variables.
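       // E.g. (sketch): in
       //   for (i = 0; i < n; i++) A[i] = 42;
       // i is used only by its increment and by the address of a consecutive,
       // widened store, and the increment only by i and the (already uniform)
       // exit compare, so both i and i.next remain uniform.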
3792   BasicBlock *Latch = TheLoop->getLoopLatch();
3793   for (const auto &Induction : Legal->getInductionVars()) {
3794     auto *Ind = Induction.first;
3795     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
3796 
3797     // Determine if all users of the induction variable are uniform after
3798     // vectorization.
3799     bool UniformInd = all_of(Ind->users(), [&](User *U) -> bool {
3800       auto *I = cast<Instruction>(U);
3801       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
3802              IsVectorizedMemAccessUse(I, Ind);
3803     });
3804     if (!UniformInd)
3805       continue;
3806 
3807     // Determine if all users of the induction variable update instruction are
3808     // uniform after vectorization.
3809     bool UniformIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool {
3810       auto *I = cast<Instruction>(U);
3811       return I == Ind || Worklist.count(I) ||
3812              IsVectorizedMemAccessUse(I, IndUpdate);
3813     });
3814     if (!UniformIndUpdate)
3815       continue;
3816 
3817     // The induction variable and its update instruction will remain uniform.
3818     AddToWorklistIfAllowed(Ind);
3819     AddToWorklistIfAllowed(IndUpdate);
3820   }
3821 
3822   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
3823 }
3824 
3825 bool LoopVectorizationCostModel::runtimeChecksRequired() {
3826   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
3827 
3828   if (Legal->getRuntimePointerChecking()->Need) {
3829     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
3830         "runtime pointer checks needed. Enable vectorization of this "
3831         "loop with '#pragma clang loop vectorize(enable)' when "
3832         "compiling with -Os/-Oz",
3833         "CantVersionLoopWithOptForSize", ORE, TheLoop);
3834     return true;
3835   }
3836 
3837   if (!PSE.getPredicate().isAlwaysTrue()) {
3838     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
3839         "runtime SCEV checks needed. Enable vectorization of this "
3840         "loop with '#pragma clang loop vectorize(enable)' when "
3841         "compiling with -Os/-Oz",
3842         "CantVersionLoopWithOptForSize", ORE, TheLoop);
3843     return true;
3844   }
3845 
3846   // FIXME: Avoid specializing for stride==1 instead of bailing out.
3847   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
3848     reportVectorizationFailure("Runtime stride check for small trip count",
3849         "runtime stride == 1 checks needed. Enable vectorization of "
3850         "this loop without such check by compiling with -Os/-Oz",
3851         "CantVersionLoopWithOptForSize", ORE, TheLoop);
3852     return true;
3853   }
3854 
3855   return false;
3856 }
3857 
3858 bool LoopVectorizationCostModel::isScalableVectorizationAllowed() {
3859   if (IsScalableVectorizationAllowed)
3860     return *IsScalableVectorizationAllowed;
3861 
3862   IsScalableVectorizationAllowed = false;
3863   if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
3864     return false;
3865 
3866   if (Hints->isScalableVectorizationDisabled()) {
3867     reportVectorizationInfo("Scalable vectorization is explicitly disabled",
3868                             "ScalableVectorizationDisabled", ORE, TheLoop);
3869     return false;
3870   }
3871 
3872   LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
3873 
3874   auto MaxScalableVF = ElementCount::getScalable(
3875       std::numeric_limits<ElementCount::ScalarTy>::max());
3876 
3877   // Test that the loop-vectorizer can legalize all operations for this MaxVF.
3878   // FIXME: While for scalable vectors this is currently sufficient, this should
3879   // be replaced by a more detailed mechanism that filters out specific VFs,
3880   // instead of invalidating vectorization for a whole set of VFs based on the
3881   // MaxVF.
3882 
3883   // Disable scalable vectorization if the loop contains unsupported reductions.
3884   if (!canVectorizeReductions(MaxScalableVF)) {
3885     reportVectorizationInfo(
3886         "Scalable vectorization not supported for the reduction "
3887         "operations found in this loop.",
3888         "ScalableVFUnfeasible", ORE, TheLoop);
3889     return false;
3890   }
3891 
3892   // Disable scalable vectorization if the loop contains any instructions
3893   // with element types not supported for scalable vectors.
3894   if (any_of(ElementTypesInLoop, [&](Type *Ty) {
3895         return !Ty->isVoidTy() &&
3896                !this->TTI.isElementTypeLegalForScalableVector(Ty);
3897       })) {
3898     reportVectorizationInfo("Scalable vectorization is not supported "
3899                             "for all element types found in this loop.",
3900                             "ScalableVFUnfeasible", ORE, TheLoop);
3901     return false;
3902   }
3903 
3904   if (!Legal->isSafeForAnyVectorWidth() && !getMaxVScale(*TheFunction, TTI)) {
3905     reportVectorizationInfo("The target does not provide maximum vscale value "
3906                             "for safe distance analysis.",
3907                             "ScalableVFUnfeasible", ORE, TheLoop);
3908     return false;
3909   }
3910 
3911   IsScalableVectorizationAllowed = true;
3912   return true;
3913 }
3914 
3915 ElementCount
3916 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
3917   if (!isScalableVectorizationAllowed())
3918     return ElementCount::getScalable(0);
3919 
3920   auto MaxScalableVF = ElementCount::getScalable(
3921       std::numeric_limits<ElementCount::ScalarTy>::max());
3922   if (Legal->isSafeForAnyVectorWidth())
3923     return MaxScalableVF;
3924 
3925   std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
3926   // Limit MaxScalableVF by the maximum safe dependence distance.
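       // MaxVScale is known to have a value here: isScalableVectorizationAllowed()
       // already rejected the case where the loop is not safe for any vector width
       // and no maximum vscale is available. E.g. (hypothetical numbers), with
       // MaxSafeElements = 8 and *MaxVScale = 16 this yields vscale x 0, which is
       // reported as unfeasible below.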
3927   MaxScalableVF = ElementCount::getScalable(MaxSafeElements / *MaxVScale);
3928 
3929   if (!MaxScalableVF)
3930     reportVectorizationInfo(
3931         "Max legal vector width too small, scalable vectorization "
3932         "unfeasible.",
3933         "ScalableVFUnfeasible", ORE, TheLoop);
3934 
3935   return MaxScalableVF;
3936 }
3937 
3938 FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
3939     unsigned MaxTripCount, ElementCount UserVF, bool FoldTailByMasking) {
3940   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
3941   unsigned SmallestType, WidestType;
3942   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
3943 
3944   // Get the maximum safe dependence distance in bits computed by LAA.
3945   // It is computed as MaxVF * sizeOf(type) * 8, where type is taken from
3946   // the memory access that is most restrictive (involved in the smallest
3947   // dependence distance).
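       // E.g. (hypothetical numbers): with a maximum safe vector width of 256 bits
       // and a widest type of 32 bits, MaxSafeElements = bit_floor(256 / 32) = 8.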
3948   unsigned MaxSafeElements =
3949       llvm::bit_floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
3950 
3951   auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
3952   auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
3953   if (!Legal->isSafeForAnyVectorWidth())
3954     this->MaxSafeElements = MaxSafeElements;
3955 
3956   LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
3957                     << ".\n");
3958   LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
3959                     << ".\n");
3960 
3961   // First analyze the UserVF, fall back if the UserVF should be ignored.
3962   if (UserVF) {
3963     auto MaxSafeUserVF =
3964         UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
3965 
3966     if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
3967       // If `VF=vscale x N` is safe, then so is `VF=N`
3968       if (UserVF.isScalable())
3969         return FixedScalableVFPair(
3970             ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
3971 
3972       return UserVF;
3973     }
3974 
3975     assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
3976 
3977     // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
3978     // is better to ignore the hint and let the compiler choose a suitable VF.
3979     if (!UserVF.isScalable()) {
3980       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3981                         << " is unsafe, clamping to max safe VF="
3982                         << MaxSafeFixedVF << ".\n");
3983       ORE->emit([&]() {
3984         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
3985                                           TheLoop->getStartLoc(),
3986                                           TheLoop->getHeader())
3987                << "User-specified vectorization factor "
3988                << ore::NV("UserVectorizationFactor", UserVF)
3989                << " is unsafe, clamping to maximum safe vectorization factor "
3990                << ore::NV("VectorizationFactor", MaxSafeFixedVF);
3991       });
3992       return MaxSafeFixedVF;
3993     }
3994 
3995     if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
3996       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3997                         << " is ignored because scalable vectors are not "
3998                            "available.\n");
3999       ORE->emit([&]() {
4000         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4001                                           TheLoop->getStartLoc(),
4002                                           TheLoop->getHeader())
4003                << "User-specified vectorization factor "
4004                << ore::NV("UserVectorizationFactor", UserVF)
4005                << " is ignored because the target does not support scalable "
4006                   "vectors. The compiler will pick a more suitable value.";
4007       });
4008     } else {
4009       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4010                         << " is unsafe. Ignoring scalable UserVF.\n");
4011       ORE->emit([&]() {
4012         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4013                                           TheLoop->getStartLoc(),
4014                                           TheLoop->getHeader())
4015                << "User-specified vectorization factor "
4016                << ore::NV("UserVectorizationFactor", UserVF)
4017                << " is unsafe. Ignoring the hint to let the compiler pick a "
4018                   "more suitable value.";
4019       });
4020     }
4021   }
4022 
4023   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
4024                     << " / " << WidestType << " bits.\n");
4025 
4026   FixedScalableVFPair Result(ElementCount::getFixed(1),
4027                              ElementCount::getScalable(0));
4028   if (auto MaxVF =
4029           getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
4030                                   MaxSafeFixedVF, FoldTailByMasking))
4031     Result.FixedVF = MaxVF;
4032 
4033   if (auto MaxVF =
4034           getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
4035                                   MaxSafeScalableVF, FoldTailByMasking))
4036     if (MaxVF.isScalable()) {
4037       Result.ScalableVF = MaxVF;
4038       LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
4039                         << "\n");
4040     }
4041 
4042   return Result;
4043 }
4044 
4045 FixedScalableVFPair
4046 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
4047   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
4048     // TODO: It may still be useful to insert the check, since it is likely to
4049     // be dynamically uniform if the target can skip it.
4050     reportVectorizationFailure(
4051         "Not inserting runtime ptr check for divergent target",
4052         "runtime pointer checks needed. Not enabled for divergent target",
4053         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
4054     return FixedScalableVFPair::getNone();
4055   }
4056 
4057   ScalarEvolution *SE = PSE.getSE();
4058   unsigned TC = SE->getSmallConstantTripCount(TheLoop);
4059   unsigned MaxTC = PSE.getSmallConstantMaxTripCount();
4060   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4061   if (TC != MaxTC)
4062     LLVM_DEBUG(dbgs() << "LV: Found maximum trip count: " << MaxTC << '\n');
4063   if (TC == 1) {
4064     reportVectorizationFailure("Single iteration (non) loop",
4065         "loop trip count is one, irrelevant for vectorization",
4066         "SingleIterationLoop", ORE, TheLoop);
4067     return FixedScalableVFPair::getNone();
4068   }
4069 
4070   // If the backedge-taken count (BTC) is at least as wide as the widest
4071   // induction type and equals -1, then the trip count computation wraps to 0
4072   // and the vector trip count will be 0. Do not try to vectorize.
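       // E.g. (sketch): if the widest induction type is i32 and BTC is the i32
       // value -1 (0xFFFFFFFF), the trip count BTC + 1 wraps to 0, and so would
       // the vector trip count.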
4073   const SCEV *BTC = SE->getBackedgeTakenCount(TheLoop);
4074   if (!isa<SCEVCouldNotCompute>(BTC) &&
4075       BTC->getType()->getScalarSizeInBits() >=
4076           Legal->getWidestInductionType()->getScalarSizeInBits() &&
4077       SE->isKnownPredicate(CmpInst::ICMP_EQ, BTC,
4078                            SE->getMinusOne(BTC->getType()))) {
4079     reportVectorizationFailure(
4080         "Trip count computation wrapped",
4081         "backedge-taken count is -1, loop trip count wrapped to 0",
4082         "TripCountWrapped", ORE, TheLoop);
4083     return FixedScalableVFPair::getNone();
4084   }
4085 
4086   switch (ScalarEpilogueStatus) {
4087   case CM_ScalarEpilogueAllowed:
4088     return computeFeasibleMaxVF(MaxTC, UserVF, false);
4089   case CM_ScalarEpilogueNotAllowedUsePredicate:
4090     [[fallthrough]];
4091   case CM_ScalarEpilogueNotNeededUsePredicate:
4092     LLVM_DEBUG(
4093         dbgs() << "LV: vector predicate hint/switch found.\n"
4094                << "LV: Not allowing scalar epilogue, creating predicated "
4095                << "vector loop.\n");
4096     break;
4097   case CM_ScalarEpilogueNotAllowedLowTripLoop:
4098     // fallthrough as a special case of OptForSize
4099   case CM_ScalarEpilogueNotAllowedOptSize:
4100     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
4101       LLVM_DEBUG(
4102           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
4103     else
4104       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
4105                         << "count.\n");
4106 
4107     // Bail if runtime checks are required, which are not good when optimising
4108     // for size.
4109     if (runtimeChecksRequired())
4110       return FixedScalableVFPair::getNone();
4111 
4112     break;
4113   }
4114 
4115   // The only loops we can vectorize without a scalar epilogue are loops with
4116   // a bottom-test and a single exiting block. We'd have to handle the fact
4117   // that not every instruction executes on the last iteration.  This will
4118   // require a lane mask which varies through the vector loop body.  (TODO)
4119   if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
4120     // If there was a tail-folding hint/switch, but we can't fold the tail by
4121     // masking, fallback to a vectorization with a scalar epilogue.
4122     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
4123       LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
4124                            "scalar epilogue instead.\n");
4125       ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4126       return computeFeasibleMaxVF(MaxTC, UserVF, false);
4127     }
4128     return FixedScalableVFPair::getNone();
4129   }
4130 
4131   // Now try tail folding.
4132 
4133   // Invalidate interleave groups that require an epilogue if we can't mask
4134   // the interleave-group.
4135   if (!useMaskedInterleavedAccesses(TTI)) {
4136     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
4137            "No decisions should have been taken at this point");
4138     // Note: There is no need to invalidate any cost modeling decisions here, as
4139     // none were taken so far.
4140     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
4141   }
4142 
4143   FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(MaxTC, UserVF, true);
4144 
4145   // Avoid tail folding if the trip count is known to be a multiple of any VF
4146   // we choose.
4147   std::optional<unsigned> MaxPowerOf2RuntimeVF =
4148       MaxFactors.FixedVF.getFixedValue();
4149   if (MaxFactors.ScalableVF) {
4150     std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
4151     if (MaxVScale && TTI.isVScaleKnownToBeAPowerOfTwo()) {
4152       MaxPowerOf2RuntimeVF = std::max<unsigned>(
4153           *MaxPowerOf2RuntimeVF,
4154           *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue());
4155     } else
4156       MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now.
4157   }
4158 
4159   if (MaxPowerOf2RuntimeVF && *MaxPowerOf2RuntimeVF > 0) {
4160     assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
4161            "MaxFixedVF must be a power of 2");
4162     unsigned MaxVFtimesIC =
4163         UserIC ? *MaxPowerOf2RuntimeVF * UserIC : *MaxPowerOf2RuntimeVF;
4164     ScalarEvolution *SE = PSE.getSE();
4165     // Currently only loops with countable exits are vectorized, but calling
4166     // getSymbolicMaxBackedgeTakenCount allows enablement work for loops with
4167     // uncountable exits whilst also ensuring the symbolic maximum and known
4168     // back-edge taken count remain identical for loops with countable exits.
4169     const SCEV *BackedgeTakenCount = PSE.getSymbolicMaxBackedgeTakenCount();
4170     assert(BackedgeTakenCount == PSE.getBackedgeTakenCount() &&
4171            "Invalid loop count");
4172     const SCEV *ExitCount = SE->getAddExpr(
4173         BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
4174     const SCEV *Rem = SE->getURemExpr(
4175         SE->applyLoopGuards(ExitCount, TheLoop),
4176         SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
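         // E.g. (hypothetical numbers): with a known trip count of 64 and
         // MaxVFtimesIC = 16, Rem is 0 and no tail is needed; with a trip count of
         // 70 the remainder of 6 keeps the tail-folding path below in play.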
4177     if (Rem->isZero()) {
4178       // Accept MaxFixedVF if we do not have a tail.
4179       LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
4180       return MaxFactors;
4181     }
4182   }
4183 
4184   // If we don't know the precise trip count, or if the trip count that we
4185   // found modulo the vectorization factor is not zero, try to fold the tail
4186   // by masking.
4187   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
4188   setTailFoldingStyles(MaxFactors.ScalableVF.isScalable(), UserIC);
4189   if (foldTailByMasking()) {
4190     if (getTailFoldingStyle() == TailFoldingStyle::DataWithEVL) {
4191       LLVM_DEBUG(
4192           dbgs()
4193           << "LV: tail is folded with EVL, forcing unroll factor to be 1. Will "
4194              "try to generate VP Intrinsics with scalable vector "
4195              "factors only.\n");
4196       // Tail folded loop using VP intrinsics restricts the VF to be scalable
4197       // for now.
4198       // TODO: extend it for fixed vectors, if required.
4199       assert(MaxFactors.ScalableVF.isScalable() &&
4200              "Expected scalable vector factor.");
4201 
4202       MaxFactors.FixedVF = ElementCount::getFixed(1);
4203     }
4204     return MaxFactors;
4205   }
4206 
4207   // If there was a tail-folding hint/switch, but we can't fold the tail by
4208   // masking, fallback to a vectorization with a scalar epilogue.
4209   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
4210     LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
4211                          "scalar epilogue instead.\n");
4212     ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4213     return MaxFactors;
4214   }
4215 
4216   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
4217     LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
4218     return FixedScalableVFPair::getNone();
4219   }
4220 
4221   if (TC == 0) {
4222     reportVectorizationFailure(
4223         "unable to calculate the loop count due to complex control flow",
4224         "UnknownLoopCountComplexCFG", ORE, TheLoop);
4225     return FixedScalableVFPair::getNone();
4226   }
4227 
4228   reportVectorizationFailure(
4229       "Cannot optimize for size and vectorize at the same time.",
4230       "cannot optimize for size and vectorize at the same time. "
4231       "Enable vectorization of this loop with '#pragma clang loop "
4232       "vectorize(enable)' when compiling with -Os/-Oz",
4233       "NoTailLoopWithOptForSize", ORE, TheLoop);
4234   return FixedScalableVFPair::getNone();
4235 }
4236 
4237 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
4238     unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType,
4239     ElementCount MaxSafeVF, bool FoldTailByMasking) {
4240   bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
4241   const TypeSize WidestRegister = TTI.getRegisterBitWidth(
4242       ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
4243                            : TargetTransformInfo::RGK_FixedWidthVector);
4244 
4245   // Convenience function to return the minimum of two ElementCounts.
4246   auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
4247     assert((LHS.isScalable() == RHS.isScalable()) &&
4248            "Scalable flags must match");
4249     return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
4250   };
4251 
4252   // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
4253   // Note that both WidestRegister and WidestType may not be powers of 2.
4254   auto MaxVectorElementCount = ElementCount::get(
4255       llvm::bit_floor(WidestRegister.getKnownMinValue() / WidestType),
4256       ComputeScalableMaxVF);
4257   MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
4258   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
4259                     << (MaxVectorElementCount * WidestType) << " bits.\n");
4260 
4261   if (!MaxVectorElementCount) {
4262     LLVM_DEBUG(dbgs() << "LV: The target has no "
4263                       << (ComputeScalableMaxVF ? "scalable" : "fixed")
4264                       << " vector registers.\n");
4265     return ElementCount::getFixed(1);
4266   }
4267 
4268   unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue();
4269   if (MaxVectorElementCount.isScalable() &&
4270       TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
4271     auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
4272     auto Min = Attr.getVScaleRangeMin();
4273     WidestRegisterMinEC *= Min;
4274   }
4275 
4276   // When a scalar epilogue is required, at least one iteration of the scalar
4277   // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
4278   // max VF that results in a dead vector loop.
4279   if (MaxTripCount > 0 && requiresScalarEpilogue(true))
4280     MaxTripCount -= 1;
4281 
4282   if (MaxTripCount && MaxTripCount <= WidestRegisterMinEC &&
4283       (!FoldTailByMasking || isPowerOf2_32(MaxTripCount))) {
4284     // If upper bound loop trip count (TC) is known at compile time there is no
4285     // point in choosing VF greater than TC (as done in the loop below). Select
4286     // maximum power of two which doesn't exceed TC. If MaxVectorElementCount is
4287     // scalable, we only fall back on a fixed VF when the TC is less than or
4288     // equal to the known number of lanes.
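         // E.g. (sketch): with MaxTripCount = 17 (and no tail folding) this clamps
         // the VF to bit_floor(17) = 16.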
4289     auto ClampedUpperTripCount = llvm::bit_floor(MaxTripCount);
4290     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
4291                          "exceeding the constant trip count: "
4292                       << ClampedUpperTripCount << "\n");
4293     return ElementCount::get(
4294         ClampedUpperTripCount,
4295         FoldTailByMasking ? MaxVectorElementCount.isScalable() : false);
4296   }
4297 
4298   TargetTransformInfo::RegisterKind RegKind =
4299       ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
4300                            : TargetTransformInfo::RGK_FixedWidthVector;
4301   ElementCount MaxVF = MaxVectorElementCount;
4302   if (MaximizeBandwidth ||
4303       (MaximizeBandwidth.getNumOccurrences() == 0 &&
4304        (TTI.shouldMaximizeVectorBandwidth(RegKind) ||
4305         (UseWiderVFIfCallVariantsPresent && Legal->hasVectorCallVariants())))) {
4306     auto MaxVectorElementCountMaxBW = ElementCount::get(
4307         llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType),
4308         ComputeScalableMaxVF);
4309     MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
4310 
4311     // Collect all viable vectorization factors larger than the default MaxVF
4312     // (i.e. MaxVectorElementCount).
4313     SmallVector<ElementCount, 8> VFs;
4314     for (ElementCount VS = MaxVectorElementCount * 2;
4315          ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
4316       VFs.push_back(VS);
4317 
4318     // For each VF calculate its register usage.
4319     auto RUs = calculateRegisterUsage(VFs);
4320 
4321     // Select the largest VF which doesn't require more registers than existing
4322     // ones.
4323     for (int I = RUs.size() - 1; I >= 0; --I) {
4324       const auto &MLU = RUs[I].MaxLocalUsers;
4325       if (all_of(MLU, [&](decltype(MLU.front()) &LU) {
4326             return LU.second <= TTI.getNumberOfRegisters(LU.first);
4327           })) {
4328         MaxVF = VFs[I];
4329         break;
4330       }
4331     }
4332     if (ElementCount MinVF =
4333             TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
4334       if (ElementCount::isKnownLT(MaxVF, MinVF)) {
4335         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
4336                           << ") with target's minimum: " << MinVF << '\n');
4337         MaxVF = MinVF;
4338       }
4339     }
4340 
4341     // Invalidate any widening decisions we might have made, in case the loop
4342     // requires predication (decided later), but we have already made some
4343     // load/store widening decisions.
4344     invalidateCostModelingDecisions();
4345   }
4346   return MaxVF;
4347 }
4348 
4349 /// Convenience function that returns the value of vscale_range if
4350 /// vscale_range.min == vscale_range.max, and otherwise returns the value
4351 /// returned by the corresponding TTI method.
4352 static std::optional<unsigned>
4353 getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI) {
4354   const Function *Fn = L->getHeader()->getParent();
4355   if (Fn->hasFnAttribute(Attribute::VScaleRange)) {
4356     auto Attr = Fn->getFnAttribute(Attribute::VScaleRange);
4357     auto Min = Attr.getVScaleRangeMin();
4358     auto Max = Attr.getVScaleRangeMax();
4359     if (Max && Min == Max)
4360       return Max;
4361   }
4362 
4363   return TTI.getVScaleForTuning();
4364 }
4365 
4366 /// This function attempts to return a value that represents the vectorization
4367 /// factor at runtime. For fixed-width VFs we know this precisely at compile
4368 /// time, but for scalable VFs we calculate it based on an estimate of the
4369 /// vscale value.
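     /// E.g. (illustrative): for VF = vscale x 4 with a tuning vscale of 2 the
     /// estimate is 8; for a fixed VF of 8 it is simply 8.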
4370 static unsigned getEstimatedRuntimeVF(const Loop *L,
4371                                       const TargetTransformInfo &TTI,
4372                                       ElementCount VF) {
4373   unsigned EstimatedVF = VF.getKnownMinValue();
4374   if (VF.isScalable())
4375     if (std::optional<unsigned> VScale = getVScaleForTuning(L, TTI))
4376       EstimatedVF *= *VScale;
4377   assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
4378   return EstimatedVF;
4379 }
4380 
4381 bool LoopVectorizationPlanner::isMoreProfitable(
4382     const VectorizationFactor &A, const VectorizationFactor &B,
4383     const unsigned MaxTripCount) const {
4384   InstructionCost CostA = A.Cost;
4385   InstructionCost CostB = B.Cost;
4386 
4387   // Improve estimate for the vector width if it is scalable.
4388   unsigned EstimatedWidthA = A.Width.getKnownMinValue();
4389   unsigned EstimatedWidthB = B.Width.getKnownMinValue();
4390   if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI)) {
4391     if (A.Width.isScalable())
4392       EstimatedWidthA *= *VScale;
4393     if (B.Width.isScalable())
4394       EstimatedWidthB *= *VScale;
4395   }
4396 
4397   // Assume vscale may be larger than 1 (or the value being tuned for),
4398   // so that scalable vectorization is slightly favored over fixed-width
4399   // vectorization.
4400   bool PreferScalable = !TTI.preferFixedOverScalableIfEqualCost() &&
4401                         A.Width.isScalable() && !B.Width.isScalable();
4402 
4403   auto CmpFn = [PreferScalable](const InstructionCost &LHS,
4404                                 const InstructionCost &RHS) {
4405     return PreferScalable ? LHS <= RHS : LHS < RHS;
4406   };
4407 
4408   // To avoid the need for FP division:
4409   //      (CostA / EstimatedWidthA) < (CostB / EstimatedWidthB)
4410   // <=>  (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA)
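       // E.g. (hypothetical costs): CostA = 8 at EstimatedWidthA = 4 vs. CostB = 6
       // at EstimatedWidthB = 2 compares 8 * 2 = 16 against 6 * 4 = 24, so A is
       // considered more profitable (2 vs. 3 per lane).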
4411   if (!MaxTripCount)
4412     return CmpFn(CostA * EstimatedWidthB, CostB * EstimatedWidthA);
4413 
4414   auto GetCostForTC = [MaxTripCount, this](unsigned VF,
4415                                            InstructionCost VectorCost,
4416                                            InstructionCost ScalarCost) {
4417     // If the trip count is a known (possibly small) constant, the trip count
4418     // will be rounded up to an integer number of iterations under
4419     // FoldTailByMasking. The total cost in that case will be
4420     // VecCost*ceil(TripCount/VF). When not folding the tail, the total
4421     // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be
4422     // some extra overheads, but for the purpose of comparing the costs of
4423     // different VFs we can use this to compare the total loop-body cost
4424     // expected after vectorization.
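         // E.g. (hypothetical numbers): TC = 10, VF = 4, VectorCost = 20 and
         // ScalarCost = 4 gives 20 * 2 + 4 * 2 = 48 without tail folding, and
         // 20 * ceil(10 / 4) = 60 when folding the tail.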
4425     if (CM.foldTailByMasking())
4426       return VectorCost * divideCeil(MaxTripCount, VF);
4427     return VectorCost * (MaxTripCount / VF) + ScalarCost * (MaxTripCount % VF);
4428   };
4429 
4430   auto RTCostA = GetCostForTC(EstimatedWidthA, CostA, A.ScalarCost);
4431   auto RTCostB = GetCostForTC(EstimatedWidthB, CostB, B.ScalarCost);
4432   return CmpFn(RTCostA, RTCostB);
4433 }
4434 
4435 bool LoopVectorizationPlanner::isMoreProfitable(
4436     const VectorizationFactor &A, const VectorizationFactor &B) const {
4437   const unsigned MaxTripCount = PSE.getSmallConstantMaxTripCount();
4438   return LoopVectorizationPlanner::isMoreProfitable(A, B, MaxTripCount);
4439 }
4440 
4441 void LoopVectorizationPlanner::emitInvalidCostRemarks(
4442     OptimizationRemarkEmitter *ORE) {
4443   using RecipeVFPair = std::pair<VPRecipeBase *, ElementCount>;
4444   SmallVector<RecipeVFPair> InvalidCosts;
4445   for (const auto &Plan : VPlans) {
4446     for (ElementCount VF : Plan->vectorFactors()) {
4447       VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(),
4448                             CM);
4449       precomputeCosts(*Plan, VF, CostCtx);
4450       auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry());
4451       for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
4452         for (auto &R : *VPBB) {
4453           if (!R.cost(VF, CostCtx).isValid())
4454             InvalidCosts.emplace_back(&R, VF);
4455         }
4456       }
4457     }
4458   }
4459   if (InvalidCosts.empty())
4460     return;
4461 
4462   // Emit a report of VFs with invalid costs in the loop.
4463 
4464   // Group the remarks per recipe, keeping the recipe order from InvalidCosts.
4465   DenseMap<VPRecipeBase *, unsigned> Numbering;
4466   unsigned I = 0;
4467   for (auto &Pair : InvalidCosts)
4468     if (!Numbering.count(Pair.first))
4469       Numbering[Pair.first] = I++;
4470 
4471   // Sort the list, first on recipe(number) then on VF.
4472   sort(InvalidCosts, [&Numbering](RecipeVFPair &A, RecipeVFPair &B) {
4473     if (Numbering[A.first] != Numbering[B.first])
4474       return Numbering[A.first] < Numbering[B.first];
4475     const auto &LHS = A.second;
4476     const auto &RHS = B.second;
4477     return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
4478            std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
4479   });
4480 
4481   // For a list of ordered recipe-VF pairs:
4482   //   [(load, VF1), (load, VF2), (store, VF1)]
4483   // group the recipes together to emit separate remarks for:
4484   //   load  (VF1, VF2)
4485   //   store (VF1)
4486   auto Tail = ArrayRef<RecipeVFPair>(InvalidCosts);
4487   auto Subset = ArrayRef<RecipeVFPair>();
4488   do {
4489     if (Subset.empty())
4490       Subset = Tail.take_front(1);
4491 
4492     VPRecipeBase *R = Subset.front().first;
4493 
4494     unsigned Opcode =
4495         TypeSwitch<const VPRecipeBase *, unsigned>(R)
4496             .Case<VPHeaderPHIRecipe>(
4497                 [](const auto *R) { return Instruction::PHI; })
4498             .Case<VPWidenSelectRecipe>(
4499                 [](const auto *R) { return Instruction::Select; })
4500             .Case<VPWidenStoreRecipe>(
4501                 [](const auto *R) { return Instruction::Store; })
4502             .Case<VPWidenLoadRecipe>(
4503                 [](const auto *R) { return Instruction::Load; })
4504             .Case<VPWidenCallRecipe, VPWidenIntrinsicRecipe>(
4505                 [](const auto *R) { return Instruction::Call; })
4506             .Case<VPInstruction, VPWidenRecipe, VPReplicateRecipe,
4507                   VPWidenCastRecipe>(
4508                 [](const auto *R) { return R->getOpcode(); })
4509             .Case<VPInterleaveRecipe>([](const VPInterleaveRecipe *R) {
4510               return R->getStoredValues().empty() ? Instruction::Load
4511                                                   : Instruction::Store;
4512             });
4513 
4514     // If the next recipe is different, or if there are no other pairs,
4515     // emit a remark for the collated subset. e.g.
4516     //   [(load, VF1), (load, VF2)]
4517     // to emit:
4518     //  remark: invalid costs for 'load' at VF=(VF1, VF2)
4519     if (Subset == Tail || Tail[Subset.size()].first != R) {
4520       std::string OutString;
4521       raw_string_ostream OS(OutString);
4522       assert(!Subset.empty() && "Unexpected empty range");
4523       OS << "Recipe with invalid costs prevented vectorization at VF=(";
4524       for (const auto &Pair : Subset)
4525         OS << (Pair.second == Subset.front().second ? "" : ", ") << Pair.second;
4526       OS << "):";
4527       if (Opcode == Instruction::Call) {
4528         StringRef Name = "";
4529         if (auto *Int = dyn_cast<VPWidenIntrinsicRecipe>(R)) {
4530           Name = Int->getIntrinsicName();
4531         } else {
4532           auto *WidenCall = dyn_cast<VPWidenCallRecipe>(R);
4533           Function *CalledFn =
4534               WidenCall ? WidenCall->getCalledScalarFunction()
4535                         : cast<Function>(R->getOperand(R->getNumOperands() - 1)
4536                                              ->getLiveInIRValue());
4537           Name = CalledFn->getName();
4538         }
4539         OS << " call to " << Name;
4540       } else
4541         OS << " " << Instruction::getOpcodeName(Opcode);
4542       reportVectorizationInfo(OutString, "InvalidCost", ORE, OrigLoop, nullptr,
4543                               R->getDebugLoc());
4544       Tail = Tail.drop_front(Subset.size());
4545       Subset = {};
4546     } else
4547       // Grow the subset by one element
4548       Subset = Tail.take_front(Subset.size() + 1);
4549   } while (!Tail.empty());
4550 }
4551 
4552 /// Check if any recipe of \p Plan will generate a vector value, which will be
4553 /// assigned a vector register.
4554 static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
4555                                 const TargetTransformInfo &TTI) {
4556   assert(VF.isVector() && "Checking a scalar VF?");
4557   VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
4558   DenseSet<VPRecipeBase *> EphemeralRecipes;
4559   collectEphemeralRecipesForVPlan(Plan, EphemeralRecipes);
4560   // Set of already visited types.
4561   DenseSet<Type *> Visited;
4562   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
4563            vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
4564     for (VPRecipeBase &R : *VPBB) {
4565       if (EphemeralRecipes.contains(&R))
4566         continue;
4567       // Continue early if the recipe is considered to not produce a vector
4568       // result. Note that this includes VPInstruction, even though some of its
4569       // opcodes may produce a vector; this preserves existing behavior, as
4570       // VPInstructions model aspects not directly mapped to IR instructions.
4571       switch (R.getVPDefID()) {
4572       case VPDef::VPDerivedIVSC:
4573       case VPDef::VPScalarIVStepsSC:
4574       case VPDef::VPScalarCastSC:
4575       case VPDef::VPReplicateSC:
4576       case VPDef::VPInstructionSC:
4577       case VPDef::VPCanonicalIVPHISC:
4578       case VPDef::VPVectorPointerSC:
4579       case VPDef::VPReverseVectorPointerSC:
4580       case VPDef::VPExpandSCEVSC:
4581       case VPDef::VPEVLBasedIVPHISC:
4582       case VPDef::VPPredInstPHISC:
4583       case VPDef::VPBranchOnMaskSC:
4584         continue;
4585       case VPDef::VPReductionSC:
4586       case VPDef::VPActiveLaneMaskPHISC:
4587       case VPDef::VPWidenCallSC:
4588       case VPDef::VPWidenCanonicalIVSC:
4589       case VPDef::VPWidenCastSC:
4590       case VPDef::VPWidenGEPSC:
4591       case VPDef::VPWidenIntrinsicSC:
4592       case VPDef::VPWidenSC:
4593       case VPDef::VPWidenSelectSC:
4594       case VPDef::VPBlendSC:
4595       case VPDef::VPFirstOrderRecurrencePHISC:
4596       case VPDef::VPWidenPHISC:
4597       case VPDef::VPWidenIntOrFpInductionSC:
4598       case VPDef::VPWidenPointerInductionSC:
4599       case VPDef::VPReductionPHISC:
4600       case VPDef::VPInterleaveSC:
4601       case VPDef::VPWidenLoadEVLSC:
4602       case VPDef::VPWidenLoadSC:
4603       case VPDef::VPWidenStoreEVLSC:
4604       case VPDef::VPWidenStoreSC:
4605         break;
4606       default:
4607         llvm_unreachable("unhandled recipe");
4608       }
4609 
4610       auto WillWiden = [&TTI, VF](Type *ScalarTy) {
4611         Type *VectorTy = toVectorTy(ScalarTy, VF);
4612         unsigned NumLegalParts = TTI.getNumberOfParts(VectorTy);
4613         if (!NumLegalParts)
4614           return false;
4615         if (VF.isScalable()) {
4616           // <vscale x 1 x iN> is assumed to be profitable over iN because
4617           // scalable registers are a distinct register class from scalar
4618           // ones. If we ever find a target which wants to lower scalable
4619           // vectors back to scalars, we'll need to update this code to
4620           // explicitly ask TTI about the register class uses for each part.
4621           return NumLegalParts <= VF.getKnownMinValue();
4622         }
4623         // Fewer parts than elements: at least two elements share a register.
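             // E.g. (sketch): at VF = 4, a type legalized into 2 registers is
             // genuinely vectorized (2 < 4); one scalarized into 4 parts is not.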
4624         return NumLegalParts < VF.getKnownMinValue();
4625       };
4626 
4627       // If no def and not a store (e.g. branches), continue - no value to check.
4628       if (R.getNumDefinedValues() == 0 &&
4629           !isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe, VPInterleaveRecipe>(
4630               &R))
4631         continue;
4632       // For multi-def recipes (currently only interleaved loads), it suffices
4633       // to check the first def only.
4634       // For stores, check their stored value; for interleaved stores it
4635       // suffices to check the first stored value only. In all cases this is
4636       // the second operand.
4637       VPValue *ToCheck =
4638           R.getNumDefinedValues() >= 1 ? R.getVPValue(0) : R.getOperand(1);
4639       Type *ScalarTy = TypeInfo.inferScalarType(ToCheck);
4640       if (!Visited.insert({ScalarTy}).second)
4641         continue;
4642       if (WillWiden(ScalarTy))
4643         return true;
4644     }
4645   }
4646 
4647   return false;
4648 }
4649 
4650 #ifndef NDEBUG
4651 VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
4652   InstructionCost ExpectedCost = CM.expectedCost(ElementCount::getFixed(1));
4653   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
4654   assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
4655   assert(any_of(VPlans,
4656                 [](std::unique_ptr<VPlan> &P) {
4657                   return P->hasVF(ElementCount::getFixed(1));
4658                 }) &&
4659          "Expected Scalar VF to be a candidate");
4660 
4661   const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost,
4662                                        ExpectedCost);
4663   VectorizationFactor ChosenFactor = ScalarCost;
4664 
4665   bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
4666   if (ForceVectorization &&
4667       (VPlans.size() > 1 || !VPlans[0]->hasScalarVFOnly())) {
4668     // Ignore scalar width, because the user explicitly wants vectorization.
4669     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
4670     // evaluation.
4671     ChosenFactor.Cost = InstructionCost::getMax();
4672   }
4673 
4674   for (auto &P : VPlans) {
4675     for (ElementCount VF : P->vectorFactors()) {
4676       // The cost for scalar VF=1 is already calculated, so ignore it.
4677       if (VF.isScalar())
4678         continue;
4679 
4680       InstructionCost C = CM.expectedCost(VF);
4681       VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost);
4682 
4683       unsigned Width = getEstimatedRuntimeVF(OrigLoop, TTI, Candidate.Width);
4684       LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << VF
4685                         << " costs: " << (Candidate.Cost / Width));
4686       if (VF.isScalable())
4687         LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
4688                           << getVScaleForTuning(OrigLoop, TTI).value_or(1)
4689                           << ")");
4690       LLVM_DEBUG(dbgs() << ".\n");
4691 
4692       if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
4693         LLVM_DEBUG(
4694             dbgs()
4695             << "LV: Not considering vector loop of width " << VF
4696             << " because it will not generate any vector instructions.\n");
4697         continue;
4698       }
4699 
4700       if (isMoreProfitable(Candidate, ChosenFactor))
4701         ChosenFactor = Candidate;
4702     }
4703   }
4704 
4705   if (!EnableCondStoresVectorization && CM.hasPredStores()) {
4706     reportVectorizationFailure(
4707         "There are conditional stores.",
4708         "store that is conditionally executed prevents vectorization",
4709         "ConditionalStore", ORE, OrigLoop);
4710     ChosenFactor = ScalarCost;
4711   }
4712 
4713   LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
4714                  !isMoreProfitable(ChosenFactor, ScalarCost)) dbgs()
4715              << "LV: Vectorization seems to be not beneficial, "
4716              << "but was forced by a user.\n");
4717   return ChosenFactor;
4718 }
4719 #endif
4720 
4721 bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
4722     ElementCount VF) const {
4723   // Cross iteration phis such as reductions need special handling and are
4724   // currently unsupported.
4725   if (any_of(OrigLoop->getHeader()->phis(),
4726              [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(&Phi); }))
4727     return false;
4728 
4729   // Phis with uses outside of the loop require special handling and are
4730   // currently unsupported.
4731   for (const auto &Entry : Legal->getInductionVars()) {
4732     // Look for uses of the value of the induction at the last iteration.
4733     Value *PostInc =
4734         Entry.first->getIncomingValueForBlock(OrigLoop->getLoopLatch());
4735     for (User *U : PostInc->users())
4736       if (!OrigLoop->contains(cast<Instruction>(U)))
4737         return false;
4738     // Look for uses of the penultimate value of the induction.
4739     for (User *U : Entry.first->users())
4740       if (!OrigLoop->contains(cast<Instruction>(U)))
4741         return false;
4742   }
4743 
4744   // Epilogue vectorization code has not been audited to ensure it handles
4745   // non-latch exits properly.  It may be fine, but it needs to be audited and
4746   // tested.
4747   // TODO: Add support for loops with an early exit.
4748   if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch())
4749     return false;
4750 
4751   return true;
4752 }
4753 
4754 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
4755     const ElementCount VF, const unsigned IC) const {
4756   // FIXME: We need a much better cost-model to take different parameters such
4757   // as register pressure, code size increase and cost of extra branches into
4758   // account. For now we apply a very crude heuristic and only consider loops
4759   // with vectorization factors larger than a certain value.
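       // E.g. (hypothetical numbers): with a fixed main-loop VF of 8, IC of 2 and
       // a minimum-VF threshold of 16, 8 * 2 >= 16 makes epilogue vectorization
       // worth considering; a VF of 4 with IC of 1 would not reach the threshold.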
4760 
4761   // Allow the target to opt out entirely.
4762   if (!TTI.preferEpilogueVectorization())
4763     return false;
4764 
4765   // We also consider epilogue vectorization unprofitable for targets that don't
4766   // consider interleaving beneficial (e.g. MVE).
4767   if (TTI.getMaxInterleaveFactor(VF) <= 1)
4768     return false;
4769 
4770   // TODO: PR #108190 introduced a discrepancy between fixed-width and scalable
4771   // VFs when deciding profitability.
4772   // See related "TODO: extend to support scalable VFs." in
4773   // selectEpilogueVectorizationFactor.
4774   unsigned Multiplier = VF.isFixed() ? IC : 1;
4775   unsigned MinVFThreshold = EpilogueVectorizationMinVF.getNumOccurrences() > 0
4776                                 ? EpilogueVectorizationMinVF
4777                                 : TTI.getEpilogueVectorizationMinVF();
4778   return getEstimatedRuntimeVF(TheLoop, TTI, VF * Multiplier) >= MinVFThreshold;
4779 }
4780 
4781 VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
4782     const ElementCount MainLoopVF, unsigned IC) {
4783   VectorizationFactor Result = VectorizationFactor::Disabled();
4784   if (!EnableEpilogueVectorization) {
4785     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n");
4786     return Result;
4787   }
4788 
4789   if (!CM.isScalarEpilogueAllowed()) {
4790     LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no "
4791                          "epilogue is allowed.\n");
4792     return Result;
4793   }
4794 
4795   // Not really a cost consideration, but check for unsupported cases here to
4796   // simplify the logic.
4797   if (!isCandidateForEpilogueVectorization(MainLoopVF)) {
4798     LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop "
4799                          "is not a supported candidate.\n");
4800     return Result;
4801   }
4802 
4803   if (EpilogueVectorizationForceVF > 1) {
4804     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n");
4805     ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF);
4806     if (hasPlanWithVF(ForcedEC))
4807       return {ForcedEC, 0, 0};
4808 
4809     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not "
4810                          "viable.\n");
4811     return Result;
4812   }
4813 
4814   if (OrigLoop->getHeader()->getParent()->hasOptSize() ||
4815       OrigLoop->getHeader()->getParent()->hasMinSize()) {
4816     LLVM_DEBUG(
4817         dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n");
4818     return Result;
4819   }
4820 
4821   if (!CM.isEpilogueVectorizationProfitable(MainLoopVF, IC)) {
4822     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
4823                          "this loop\n");
4824     return Result;
4825   }
4826 
4827   // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
4828   // the main loop handles 8 lanes per iteration. We could still benefit from
4829   // vectorizing the epilogue loop with VF=4.
4830   ElementCount EstimatedRuntimeVF =
4831       ElementCount::getFixed(getEstimatedRuntimeVF(OrigLoop, TTI, MainLoopVF));
4832 
4833   ScalarEvolution &SE = *PSE.getSE();
4834   Type *TCType = Legal->getWidestInductionType();
4835   const SCEV *RemainingIterations = nullptr;
4836   unsigned MaxTripCount = 0;
4837   for (auto &NextVF : ProfitableVFs) {
4838     // Skip candidate VFs without a corresponding VPlan.
4839     if (!hasPlanWithVF(NextVF.Width))
4840       continue;
4841 
4842     // Skip candidate VFs with widths >= the (estimated) runtime VF (scalable
4843     // vectors) or > the VF of the main loop (fixed vectors).
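         // For example (illustrative numbers, not taken from the source): with
         // MainLoopVF = vscale x 4 and an estimated vscale of 2, the estimated
         // runtime VF is 8, so fixed-width candidates with VF >= 8 and scalable
         // candidates with VF >= vscale x 4 are skipped.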
4844     if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
4845          ElementCount::isKnownGE(NextVF.Width, EstimatedRuntimeVF)) ||
4846         (NextVF.Width.isScalable() &&
4847          ElementCount::isKnownGE(NextVF.Width, MainLoopVF)) ||
4848         (!NextVF.Width.isScalable() && !MainLoopVF.isScalable() &&
4849          ElementCount::isKnownGT(NextVF.Width, MainLoopVF)))
4850       continue;
4851 
4852     // If NextVF is greater than the number of remaining iterations, the
4853     // epilogue loop would be dead. Skip such factors.
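         // For example (illustrative): with a trip count of 100, a fixed
         // MainLoopVF of 8 and IC = 2, the main vector loop covers multiples of
         // 16, leaving 100 urem 16 = 4 remaining iterations, so any candidate
         // epilogue VF greater than 4 is skipped below.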
4854     if (!MainLoopVF.isScalable() && !NextVF.Width.isScalable()) {
4855       // TODO: extend to support scalable VFs.
4856       if (!RemainingIterations) {
4857         const SCEV *TC = vputils::getSCEVExprForVPValue(
4858             getPlanFor(NextVF.Width).getTripCount(), SE);
4859         assert(!isa<SCEVCouldNotCompute>(TC) &&
4860                "Trip count SCEV must be computable");
4861         RemainingIterations = SE.getURemExpr(
4862             TC, SE.getConstant(TCType, MainLoopVF.getKnownMinValue() * IC));
4863         MaxTripCount = MainLoopVF.getKnownMinValue() * IC - 1;
4864         if (SE.isKnownPredicate(CmpInst::ICMP_ULT, RemainingIterations,
4865                                 SE.getConstant(TCType, MaxTripCount))) {
4866           MaxTripCount =
4867               SE.getUnsignedRangeMax(RemainingIterations).getZExtValue();
4868         }
4869         LLVM_DEBUG(dbgs() << "LEV: Maximum Trip Count for Epilogue: "
4870                           << MaxTripCount << "\n");
4871       }
4872       if (SE.isKnownPredicate(
4873               CmpInst::ICMP_UGT,
4874               SE.getConstant(TCType, NextVF.Width.getKnownMinValue()),
4875               RemainingIterations))
4876         continue;
4877     }
4878 
4879     if (Result.Width.isScalar() ||
4880         isMoreProfitable(NextVF, Result, MaxTripCount))
4881       Result = NextVF;
4882   }
4883 
4884   if (Result != VectorizationFactor::Disabled())
4885     LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
4886                       << Result.Width << "\n");
4887   return Result;
4888 }
4889 
4890 std::pair<unsigned, unsigned>
4891 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
4892   unsigned MinWidth = -1U;
4893   unsigned MaxWidth = 8;
4894   const DataLayout &DL = TheFunction->getDataLayout();
4895   // For in-loop reductions, no element types are added to ElementTypesInLoop
4896   // if there are no loads/stores in the loop. In this case, check through the
4897   // reduction variables to determine the maximum width.
4898   if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
4899     // Reset MaxWidth so that we can find the smallest type used by recurrences
4900     // in the loop.
4901     MaxWidth = -1U;
4902     for (const auto &PhiDescriptorPair : Legal->getReductionVars()) {
4903       const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
4904       // When finding the min width used by the recurrence we need to account
4905       // for casts on the input operands of the recurrence.
4906       MaxWidth = std::min<unsigned>(
4907           MaxWidth, std::min<unsigned>(
4908                         RdxDesc.getMinWidthCastToRecurrenceTypeInBits(),
4909                         RdxDesc.getRecurrenceType()->getScalarSizeInBits()));
4910     }
4911   } else {
4912     for (Type *T : ElementTypesInLoop) {
4913       MinWidth = std::min<unsigned>(
4914           MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
4915       MaxWidth = std::max<unsigned>(
4916           MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
4917     }
4918   }
4919   return {MinWidth, MaxWidth};
4920 }
4921 
4922 void LoopVectorizationCostModel::collectElementTypesForWidening() {
4923   ElementTypesInLoop.clear();
4924   // For each block.
4925   for (BasicBlock *BB : TheLoop->blocks()) {
4926     // For each instruction in the loop.
4927     for (Instruction &I : BB->instructionsWithoutDebug()) {
4928       Type *T = I.getType();
4929 
4930       // Skip ignored values.
4931       if (ValuesToIgnore.count(&I))
4932         continue;
4933 
4934       // Only examine Loads, Stores and PHINodes.
4935       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
4936         continue;
4937 
4938       // Examine PHI nodes that are reduction variables. Update the type to
4939       // account for the recurrence type.
4940       if (auto *PN = dyn_cast<PHINode>(&I)) {
4941         if (!Legal->isReductionVariable(PN))
4942           continue;
4943         const RecurrenceDescriptor &RdxDesc =
4944             Legal->getReductionVars().find(PN)->second;
4945         if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
4946             TTI.preferInLoopReduction(RdxDesc.getOpcode(),
4947                                       RdxDesc.getRecurrenceType(),
4948                                       TargetTransformInfo::ReductionFlags()))
4949           continue;
4950         T = RdxDesc.getRecurrenceType();
4951       }
4952 
4953       // Examine the stored values.
4954       if (auto *ST = dyn_cast<StoreInst>(&I))
4955         T = ST->getValueOperand()->getType();
4956 
4957       assert(T->isSized() &&
4958              "Expected the load/store/recurrence type to be sized");
4959 
4960       ElementTypesInLoop.insert(T);
4961     }
4962   }
4963 }
4964 
4965 unsigned
4966 LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
4967                                                   InstructionCost LoopCost) {
4968   // -- The interleave heuristics --
4969   // We interleave the loop in order to expose ILP and reduce the loop overhead.
4970   // There are many micro-architectural considerations that we can't predict
4971   // at this level. For example, frontend pressure (on decode or fetch) due to
4972   // code size, or the number and capabilities of the execution ports.
4973   //
4974   // We use the following heuristics to select the interleave count:
4975   // 1. If the code has reductions, then we interleave to break the cross
4976   // iteration dependency.
4977   // 2. If the loop is really small, then we interleave to reduce the loop
4978   // overhead.
4979   // 3. We don't interleave if we think that we will spill registers to memory
4980   // due to the increased register pressure.
4981 
4982   if (!isScalarEpilogueAllowed())
4983     return 1;
4984 
4985   // Do not interleave if EVL is preferred and no User IC is specified.
4986   if (foldTailWithEVL()) {
4987     LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. "
4988                          "Unroll factor forced to be 1.\n");
4989     return 1;
4990   }
4991 
4992   // The max safe dependence distance already limits the vector width; do not interleave further.
4993   if (!Legal->isSafeForAnyVectorWidth())
4994     return 1;
4995 
4996   // We don't attempt to perform interleaving for loops with uncountable early
4997   // exits because the VPInstruction::AnyOf code cannot currently handle
4998   // multiple parts.
4999   if (Legal->hasUncountableEarlyExit())
5000     return 1;
5001 
5002   auto BestKnownTC = getSmallBestKnownTC(PSE, TheLoop);
5003   const bool HasReductions = !Legal->getReductionVars().empty();
5004 
5005   // If we did not calculate the cost for VF (because the user selected the VF)
5006   // then we calculate the cost of VF here.
5007   if (LoopCost == 0) {
5008     LoopCost = expectedCost(VF);
5009     assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost");
5010 
5011     // Loop body is free and there is no need for interleaving.
5012     if (LoopCost == 0)
5013       return 1;
5014   }
5015 
5016   RegisterUsage R = calculateRegisterUsage({VF})[0];
5017   // We divide by these values below, so assume that at least one
5018   // instruction uses at least one register (clamp each count to at least 1).
5019   for (auto &Pair : R.MaxLocalUsers) {
5020     Pair.second = std::max(Pair.second, 1U);
5021   }
5022 
5023   // We calculate the interleave count using the following formula.
5024   // Subtract the number of loop invariants from the number of available
5025   // registers. These registers are used by all of the interleaved instances.
5026   // Next, divide the remaining registers by the number of registers that is
5027   // required by the loop, in order to estimate how many parallel instances
5028   // fit without causing spills. All of this is rounded down if necessary to be
5029   // a power of two. We want a power-of-two interleave count to simplify any
5030   // addressing operations or alignment considerations.
5031   // We also want power-of-two interleave counts to ensure that the induction
5032   // variable of the vector loop wraps to zero, when tail is folded by masking;
5033   // this currently happens when OptForSize, in which case IC is set to 1 above.
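       // For example (illustrative numbers, not taken from the source): with 32
       // registers in a class, 4 of them holding loop-invariant values and at
       // most 7 values live at once, the basic formula gives
       // bit_floor((32 - 4) / 7) = 4 interleaved copies without spilling.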
5034   unsigned IC = UINT_MAX;
5035 
5036   for (const auto &Pair : R.MaxLocalUsers) {
5037     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(Pair.first);
5038     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5039                       << " registers of "
5040                       << TTI.getRegisterClassName(Pair.first)
5041                       << " register class\n");
5042     if (VF.isScalar()) {
5043       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5044         TargetNumRegisters = ForceTargetNumScalarRegs;
5045     } else {
5046       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5047         TargetNumRegisters = ForceTargetNumVectorRegs;
5048     }
5049     unsigned MaxLocalUsers = Pair.second;
5050     unsigned LoopInvariantRegs = 0;
5051     if (R.LoopInvariantRegs.find(Pair.first) != R.LoopInvariantRegs.end())
5052       LoopInvariantRegs = R.LoopInvariantRegs[Pair.first];
5053 
5054     unsigned TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs) /
5055                                      MaxLocalUsers);
5056     // Don't count the induction variable as interleaved.
5057     if (EnableIndVarRegisterHeur) {
5058       TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5059                               std::max(1U, (MaxLocalUsers - 1)));
5060     }
5061 
5062     IC = std::min(IC, TmpIC);
5063   }
5064 
5065   // Clamp the interleave ranges to reasonable counts.
5066   unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
5067 
5068   // Check if the user has overridden the max.
5069   if (VF.isScalar()) {
5070     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5071       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5072   } else {
5073     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5074       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5075   }
5076 
5077   unsigned EstimatedVF = getEstimatedRuntimeVF(TheLoop, TTI, VF);
5078   unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5079   if (KnownTC > 0) {
5080     // At least one iteration must be scalar when this constraint holds. So the
5081     // maximum available iterations for interleaving is one less.
5082     unsigned AvailableTC =
5083         requiresScalarEpilogue(VF.isVector()) ? KnownTC - 1 : KnownTC;
5084 
5085     // If the trip count is known, we select between two prospective ICs, where
5086     // 1) the aggressive IC is capped by the trip count divided by VF
5087     // 2) the conservative IC is capped by the trip count divided by (VF * 2)
5088     // The final IC is selected in a way that the epilogue loop trip count is
5089     // minimized while maximizing the IC itself, so that we either run the
5090     // vector loop at least once if it generates a small epilogue loop, or else
5091     // we run the vector loop at least twice.
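         // For example (illustrative): with AvailableTC = 48, EstimatedVF = 8
         // and a target maximum of 4, InterleaveCountUB = bit_floor(min(6, 4)) =
         // 4 and InterleaveCountLB = bit_floor(min(3, 4)) = 2; the scalar tails
         // are 48 % 32 = 16 and 48 % 16 = 0, which differ, so the conservative
         // lower bound of 2 is kept.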
5092 
5093     unsigned InterleaveCountUB = bit_floor(
5094         std::max(1u, std::min(AvailableTC / EstimatedVF, MaxInterleaveCount)));
5095     unsigned InterleaveCountLB = bit_floor(std::max(
5096         1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
5097     MaxInterleaveCount = InterleaveCountLB;
5098 
5099     if (InterleaveCountUB != InterleaveCountLB) {
5100       unsigned TailTripCountUB =
5101           (AvailableTC % (EstimatedVF * InterleaveCountUB));
5102       unsigned TailTripCountLB =
5103           (AvailableTC % (EstimatedVF * InterleaveCountLB));
5104       // If both produce the same scalar tail, maximize the IC to do the
5105       // same work in fewer vector loop iterations.
5106       if (TailTripCountUB == TailTripCountLB)
5107         MaxInterleaveCount = InterleaveCountUB;
5108     }
5109   } else if (BestKnownTC && *BestKnownTC > 0) {
5110     // At least one iteration must be scalar when this constraint holds. So the
5111     // maximum available iterations for interleaving is one less.
5112     unsigned AvailableTC = requiresScalarEpilogue(VF.isVector())
5113                                ? (*BestKnownTC) - 1
5114                                : *BestKnownTC;
5115 
5116     // If the trip count is only an estimated compile-time constant, cap the
5117     // IC by the trip count divided by (VF * 2), so that the vector loop runs
5118     // at least twice; this makes interleaving appear profitable when an
5119     // epilogue loop is present. Since the exact trip count is not known, we
5120     // choose to be conservative in our IC estimate.
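         // For example (illustrative): with AvailableTC = 100, EstimatedVF = 8
         // and a target maximum of 8, the cap becomes
         // bit_floor(min(100 / 16, 8)) = bit_floor(6) = 4.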
5121     MaxInterleaveCount = bit_floor(std::max(
5122         1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
5123   }
5124 
5125   assert(MaxInterleaveCount > 0 &&
5126          "Maximum interleave count must be greater than 0");
5127 
5128   // Clamp the calculated IC to be between 1 and the max interleave count
5129   // that the target and trip count allow.
5130   if (IC > MaxInterleaveCount)
5131     IC = MaxInterleaveCount;
5132   else
5133     // Make sure IC is greater than 0.
5134     IC = std::max(1u, IC);
5135 
5136   assert(IC > 0 && "Interleave count must be greater than 0.");
5137 
5138   // Interleave if we vectorized this loop and there is a reduction that could
5139   // benefit from interleaving.
5140   if (VF.isVector() && HasReductions) {
5141     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5142     return IC;
5143   }
5144 
5145   // For any scalar loop that either requires runtime checks or predication we
5146   // are better off leaving this to the unroller. Note that if we've already
5147   // vectorized the loop we will have done the runtime check and so interleaving
5148   // won't require further checks.
5149   bool ScalarInterleavingRequiresPredication =
5150       (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) {
5151          return Legal->blockNeedsPredication(BB);
5152        }));
5153   bool ScalarInterleavingRequiresRuntimePointerCheck =
5154       (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
5155 
5156   // We want to interleave small loops in order to reduce the loop overhead and
5157   // potentially expose ILP opportunities.
5158   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
5159                     << "LV: IC is " << IC << '\n'
5160                     << "LV: VF is " << VF << '\n');
5161   const bool AggressivelyInterleaveReductions =
5162       TTI.enableAggressiveInterleaving(HasReductions);
5163   if (!ScalarInterleavingRequiresRuntimePointerCheck &&
5164       !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
5165     // We assume that the cost overhead is 1 and we use the cost model
5166     // to estimate the cost of the loop and interleave until the cost of the
5167     // loop overhead is about 5% of the cost of the loop.
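         // For example (illustrative, assuming a SmallLoopCost of 20): with a
         // loop body cost of 5, SmallIC = min(IC, bit_floor(20 / 5)) = min(IC, 4).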
5168     unsigned SmallIC = std::min(IC, (unsigned)llvm::bit_floor<uint64_t>(
5169                                         SmallLoopCost / *LoopCost.getValue()));
5170 
5171     // Interleave until store/load ports (estimated by max interleave count) are
5172     // saturated.
5173     unsigned NumStores = Legal->getNumStores();
5174     unsigned NumLoads = Legal->getNumLoads();
5175     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5176     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5177 
5178     // There is little point in interleaving for reductions containing selects
5179     // and compares when VF=1 since it may just create more overhead than it's
5180     // worth for loops with small trip counts. This is because we still have to
5181     // do the final reduction after the loop.
5182     bool HasSelectCmpReductions =
5183         HasReductions &&
5184         any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5185           const RecurrenceDescriptor &RdxDesc = Reduction.second;
5186           RecurKind RK = RdxDesc.getRecurrenceKind();
5187           return RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) ||
5188                  RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK);
5189         });
5190     if (HasSelectCmpReductions) {
5191       LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
5192       return 1;
5193     }
5194 
5195     // If we have a scalar reduction (vector reductions are already dealt with
5196     // by this point), we can increase the critical path length if the loop
5197     // we're interleaving is inside another loop. For tree-wise reductions
5198     // set the limit to 2, and for ordered reductions it's best to disable
5199     // interleaving entirely.
5200     if (HasReductions && TheLoop->getLoopDepth() > 1) {
5201       bool HasOrderedReductions =
5202           any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5203             const RecurrenceDescriptor &RdxDesc = Reduction.second;
5204             return RdxDesc.isOrdered();
5205           });
5206       if (HasOrderedReductions) {
5207         LLVM_DEBUG(
5208             dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
5209         return 1;
5210       }
5211 
5212       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5213       SmallIC = std::min(SmallIC, F);
5214       StoresIC = std::min(StoresIC, F);
5215       LoadsIC = std::min(LoadsIC, F);
5216     }
5217 
5218     if (EnableLoadStoreRuntimeInterleave &&
5219         std::max(StoresIC, LoadsIC) > SmallIC) {
5220       LLVM_DEBUG(
5221           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5222       return std::max(StoresIC, LoadsIC);
5223     }
5224 
5225     // If there are scalar reductions and TTI has enabled aggressive
5226     // interleaving for reductions, we will interleave to expose ILP.
5227     if (VF.isScalar() && AggressivelyInterleaveReductions) {
5228       LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5229       // Interleave no less than SmallIC, but not as aggressively as the
5230       // normal IC, to handle the rare case where resources are too limited.
5231       return std::max(IC / 2, SmallIC);
5232     }
5233 
5234     LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5235     return SmallIC;
5236   }
5237 
5238   // Interleave if this is a large loop (small loops are already dealt with by
5239   // this point) that could benefit from interleaving.
5240   if (AggressivelyInterleaveReductions) {
5241     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5242     return IC;
5243   }
5244 
5245   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5246   return 1;
5247 }
5248 
5249 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5250 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
5251   // This function calculates the register usage by measuring the highest number
5252   // of values that are alive at a single location. Obviously, this is a very
5253   // rough estimation. We scan the loop in topological order and
5254   // assign a number to each instruction. We use RPO to ensure that defs are
5255   // met before their users. We assume that each instruction that has in-loop
5256   // users starts an interval. We record every time that an in-loop value is
5257   // used, so we have a list of the first and last occurrences of each
5258   // instruction. Next, we transpose this data structure into a multi map that
5259   // holds the list of intervals that *end* at a specific location. This multi
5260   // map allows us to perform a linear search. We scan the instructions linearly
5261   // and record each time that a new interval starts, by placing it in a set.
5262   // If we find this value in the multi-map then we remove it from the set.
5263   // The max register usage is the maximum size of the set.
5264   // We also search for instructions that are defined outside the loop, but are
5265   // used inside the loop. We need this number separately from the max-interval
5266   // usage number because when we unroll, loop-invariant values do not take
5267   // more registers.
5268   LoopBlocksDFS DFS(TheLoop);
5269   DFS.perform(LI);
5270 
5271   RegisterUsage RU;
5272 
5273   // Each 'key' in the map opens a new interval. The values
5274   // of the map are the index of the 'last seen' usage of the
5275   // instruction that is the key.
5276   using IntervalMap = SmallDenseMap<Instruction *, unsigned, 16>;
5277 
5278   // Maps instruction to its index.
5279   SmallVector<Instruction *, 64> IdxToInstr;
5280   // Marks the end of each interval.
5281   IntervalMap EndPoint;
5282   // Saves the set of instructions that are used within the loop.
5283   SmallPtrSet<Instruction *, 8> Ends;
5284   // Saves the list of values that are used in the loop but are defined outside
5285   // the loop (not including non-instruction values such as arguments and
5286   // constants).
5287   SmallSetVector<Instruction *, 8> LoopInvariants;
5288 
5289   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5290     for (Instruction &I : BB->instructionsWithoutDebug()) {
5291       IdxToInstr.push_back(&I);
5292 
5293       // Save the end location of each USE.
5294       for (Value *U : I.operands()) {
5295         auto *Instr = dyn_cast<Instruction>(U);
5296 
5297         // Ignore non-instruction values such as arguments, constants, etc.
5298         // FIXME: Might need some motivation for why these values are ignored. If,
5299         // for example, an argument is used inside the loop, it will increase the
5300         // register pressure (so shouldn't we add it to LoopInvariants?).
5301         if (!Instr)
5302           continue;
5303 
5304         // If this instruction is outside the loop then record it and continue.
5305         if (!TheLoop->contains(Instr)) {
5306           LoopInvariants.insert(Instr);
5307           continue;
5308         }
5309 
5310         // Overwrite previous end points.
5311         EndPoint[Instr] = IdxToInstr.size();
5312         Ends.insert(Instr);
5313       }
5314     }
5315   }
5316 
5317   // Saves the list of intervals that end with the index in 'key'.
5318   using InstrList = SmallVector<Instruction *, 2>;
5319   SmallDenseMap<unsigned, InstrList, 16> TransposeEnds;
5320 
5321   // Transpose the EndPoints to a list of values that end at each index.
5322   for (auto &Interval : EndPoint)
5323     TransposeEnds[Interval.second].push_back(Interval.first);
5324 
5325   SmallPtrSet<Instruction *, 8> OpenIntervals;
5326   SmallVector<RegisterUsage, 8> RUs(VFs.size());
5327   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
5328 
5329   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5330 
5331   const auto &TTICapture = TTI;
5332   auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
5333     if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty) ||
5334         (VF.isScalable() &&
5335          !TTICapture.isElementTypeLegalForScalableVector(Ty)))
5336       return 0;
5337     return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
5338   };
5339 
5340   for (unsigned int Idx = 0, Sz = IdxToInstr.size(); Idx < Sz; ++Idx) {
5341     Instruction *I = IdxToInstr[Idx];
5342 
5343     // Remove all of the instructions that end at this location.
5344     InstrList &List = TransposeEnds[Idx];
5345     for (Instruction *ToRemove : List)
5346       OpenIntervals.erase(ToRemove);
5347 
5348     // Ignore instructions that are never used within the loop.
5349     if (!Ends.count(I))
5350       continue;
5351 
5352     // Skip ignored values.
5353     if (ValuesToIgnore.count(I))
5354       continue;
5355 
5356     collectInLoopReductions();
5357 
5358     // For each VF find the maximum usage of registers.
5359     for (unsigned J = 0, E = VFs.size(); J < E; ++J) {
5360       // Count the number of registers used, per register class, given all open
5361       // intervals.
5362       // Note that elements in this SmallMapVector will be default constructed
5363       // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if
5364       // there is no previous entry for ClassID.
5365       SmallMapVector<unsigned, unsigned, 4> RegUsage;
5366 
5367       if (VFs[J].isScalar()) {
5368         for (auto *Inst : OpenIntervals) {
5369           unsigned ClassID =
5370               TTI.getRegisterClassForType(false, Inst->getType());
5371           // FIXME: The target might use more than one register for the type
5372           // even in the scalar case.
5373           RegUsage[ClassID] += 1;
5374         }
5375       } else {
5376         collectUniformsAndScalars(VFs[J]);
5377         for (auto *Inst : OpenIntervals) {
5378           // Skip ignored values for VF > 1.
5379           if (VecValuesToIgnore.count(Inst))
5380             continue;
5381           if (isScalarAfterVectorization(Inst, VFs[J])) {
5382             unsigned ClassID =
5383                 TTI.getRegisterClassForType(false, Inst->getType());
5384             // FIXME: The target might use more than one register for the type
5385             // even in the scalar case.
5386             RegUsage[ClassID] += 1;
5387           } else {
5388             unsigned ClassID =
5389                 TTI.getRegisterClassForType(true, Inst->getType());
5390             RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[J]);
5391           }
5392         }
5393       }
5394 
5395       for (const auto &Pair : RegUsage) {
5396         auto &Entry = MaxUsages[J][Pair.first];
5397         Entry = std::max(Entry, Pair.second);
5398       }
5399     }
5400 
5401     LLVM_DEBUG(dbgs() << "LV(REG): At #" << Idx << " Interval # "
5402                       << OpenIntervals.size() << '\n');
5403 
5404     // Add the current instruction to the list of open intervals.
5405     OpenIntervals.insert(I);
5406   }
5407 
5408   for (unsigned Idx = 0, End = VFs.size(); Idx < End; ++Idx) {
5409     // Note that elements in this SmallMapVector will be default constructed
5410     // as 0. So we can use "Invariant[ClassID] += n" in the code below even if
5411     // there is no previous entry for ClassID.
5412     SmallMapVector<unsigned, unsigned, 4> Invariant;
5413 
5414     for (auto *Inst : LoopInvariants) {
5415       // FIXME: The target might use more than one register for the type
5416       // even in the scalar case.
5417       bool IsScalar = all_of(Inst->users(), [&](User *U) {
5418         auto *I = cast<Instruction>(U);
5419         return TheLoop != LI->getLoopFor(I->getParent()) ||
5420                isScalarAfterVectorization(I, VFs[Idx]);
5421       });
5422 
5423       ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[Idx];
5424       unsigned ClassID =
5425           TTI.getRegisterClassForType(VF.isVector(), Inst->getType());
5426       Invariant[ClassID] += GetRegUsage(Inst->getType(), VF);
5427     }
5428 
5429     LLVM_DEBUG({
5430       dbgs() << "LV(REG): VF = " << VFs[Idx] << '\n';
5431       dbgs() << "LV(REG): Found max usage: " << MaxUsages[Idx].size()
5432              << " item\n";
5433       for (const auto &pair : MaxUsages[Idx]) {
5434         dbgs() << "LV(REG): RegisterClass: "
5435                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5436                << " registers\n";
5437       }
5438       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
5439              << " item\n";
5440       for (const auto &pair : Invariant) {
5441         dbgs() << "LV(REG): RegisterClass: "
5442                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5443                << " registers\n";
5444       }
5445     });
5446 
5447     RU.LoopInvariantRegs = Invariant;
5448     RU.MaxLocalUsers = MaxUsages[Idx];
5449     RUs[Idx] = RU;
5450   }
5451 
5452   return RUs;
5453 }
5454 
5455 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
5456                                                            ElementCount VF) {
5457   // TODO: Cost model for emulated masked load/store is completely
5458   // broken. This hack guides the cost model to use an artificially
5459   // high enough value to practically disable vectorization with such
5460   // operations, except where the previously deployed legality hack allowed
5461   // using very low cost values. This is to avoid regressions coming simply
5462   // from moving the "masked load/store" check from legality to the cost model.
5463   // Masked Load/Gather emulation was previously never allowed.
5464   // A limited amount of Masked Store/Scatter emulation was allowed.
5465   assert((isPredicatedInst(I)) &&
5466          "Expecting a scalar emulated instruction");
5467   return isa<LoadInst>(I) ||
5468          (isa<StoreInst>(I) &&
5469           NumPredStores > NumberOfStoresToPredicate);
5470 }
5471 
5472 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
5473   // If we aren't vectorizing the loop, or if we've already collected the
5474   // instructions to scalarize, there's nothing to do. Collection may already
5475   // have occurred if we have a user-selected VF and are now computing the
5476   // expected cost for interleaving.
5477   if (VF.isScalar() || VF.isZero() || InstsToScalarize.contains(VF))
5478     return;
5479 
5480   // Initialize a mapping for VF in InstsToScalarize. If we find that it's
5481   // not profitable to scalarize any instructions, the presence of VF in the
5482   // map will indicate that we've analyzed it already.
5483   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5484 
5485   PredicatedBBsAfterVectorization[VF].clear();
5486 
5487   // Find all the instructions that are scalar with predication in the loop and
5488   // determine if it would be better to not if-convert the blocks they are in.
5489   // If so, we also record the instructions to scalarize.
5490   for (BasicBlock *BB : TheLoop->blocks()) {
5491     if (!blockNeedsPredicationForAnyReason(BB))
5492       continue;
5493     for (Instruction &I : *BB)
5494       if (isScalarWithPredication(&I, VF)) {
5495         ScalarCostsTy ScalarCosts;
5496         // Do not apply discount logic for:
5497         // 1. Scalars after vectorization, as there will only be a single copy
5498         // of the instruction.
5499         // 2. Scalable VF, as that would lead to invalid scalarization costs.
5500         // 3. Emulated masked memrefs, if a hacked cost is needed.
5501         if (!isScalarAfterVectorization(&I, VF) && !VF.isScalable() &&
5502             !useEmulatedMaskMemRefHack(&I, VF) &&
5503             computePredInstDiscount(&I, ScalarCosts, VF) >= 0) {
5504           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5505           // Check if we decided to scalarize a call. If so, update the widening
5506           // decision of the call to CM_Scalarize with the computed scalar cost.
5507           for (const auto &[I, _] : ScalarCosts) {
5508             auto *CI = dyn_cast<CallInst>(I);
5509             if (!CI || !CallWideningDecisions.contains({CI, VF}))
5510               continue;
5511             CallWideningDecisions[{CI, VF}].Kind = CM_Scalarize;
5512             CallWideningDecisions[{CI, VF}].Cost = ScalarCosts[CI];
5513           }
5514         }
5515         // Remember that BB will remain after vectorization.
5516         PredicatedBBsAfterVectorization[VF].insert(BB);
5517         for (auto *Pred : predecessors(BB)) {
5518           if (Pred->getSingleSuccessor() == BB)
5519             PredicatedBBsAfterVectorization[VF].insert(Pred);
5520         }
5521       }
5522   }
5523 }
5524 
5525 InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
5526     Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
5527   assert(!isUniformAfterVectorization(PredInst, VF) &&
5528          "Instruction marked uniform-after-vectorization will be predicated");
5529 
5530   // Initialize the discount to zero, meaning that the scalar version and the
5531   // vector version cost the same.
5532   InstructionCost Discount = 0;
5533 
5534   // Holds instructions to analyze. The instructions we visit are mapped in
5535   // ScalarCosts. Those instructions are the ones that would be scalarized if
5536   // we find that the scalar version costs less.
5537   SmallVector<Instruction *, 8> Worklist;
5538 
5539   // Returns true if the given instruction can be scalarized.
5540   auto CanBeScalarized = [&](Instruction *I) -> bool {
5541     // We only attempt to scalarize instructions forming a single-use chain
5542     // from the original predicated block that would otherwise be vectorized.
5543     // Although not strictly necessary, we give up on instructions we know will
5544     // already be scalar to avoid traversing chains that are unlikely to be
5545     // beneficial.
5546     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5547         isScalarAfterVectorization(I, VF))
5548       return false;
5549 
5550     // If the instruction is scalar with predication, it will be analyzed
5551     // separately. We ignore it within the context of PredInst.
5552     if (isScalarWithPredication(I, VF))
5553       return false;
5554 
5555     // If any of the instruction's operands are uniform after vectorization,
5556     // the instruction cannot be scalarized. This prevents, for example, a
5557     // masked load from being scalarized.
5558     //
5559     // We assume we will only emit a value for lane zero of an instruction
5560     // marked uniform after vectorization, rather than VF identical values.
5561     // Thus, if we scalarize an instruction that uses a uniform, we would
5562     // create uses of values corresponding to the lanes we aren't emitting code
5563     // for. This behavior can be changed by allowing getScalarValue to clone
5564     // the lane zero values for uniforms rather than asserting.
5565     for (Use &U : I->operands())
5566       if (auto *J = dyn_cast<Instruction>(U.get()))
5567         if (isUniformAfterVectorization(J, VF))
5568           return false;
5569 
5570     // Otherwise, we can scalarize the instruction.
5571     return true;
5572   };
5573 
5574   // Compute the expected cost discount from scalarizing the entire expression
5575   // feeding the predicated instruction. We currently only consider expressions
5576   // that are single-use instruction chains.
5577   Worklist.push_back(PredInst);
5578   while (!Worklist.empty()) {
5579     Instruction *I = Worklist.pop_back_val();
5580 
5581     // If we've already analyzed the instruction, there's nothing to do.
5582     if (ScalarCosts.contains(I))
5583       continue;
5584 
5585     // Compute the cost of the vector instruction. Note that this cost already
5586     // includes the scalarization overhead of the predicated instruction.
5587     InstructionCost VectorCost = getInstructionCost(I, VF);
5588 
5589     // Compute the cost of the scalarized instruction. This cost is the cost of
5590     // the instruction as if it wasn't if-converted and instead remained in the
5591     // predicated block. We will scale this cost by block probability after
5592     // computing the scalarization overhead.
5593     InstructionCost ScalarCost =
5594         VF.getFixedValue() * getInstructionCost(I, ElementCount::getFixed(1));
5595 
5596     // Compute the scalarization overhead of needed insertelement instructions
5597     // and phi nodes.
5598     TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
5599     if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
5600       ScalarCost += TTI.getScalarizationOverhead(
5601           cast<VectorType>(toVectorTy(I->getType(), VF)),
5602           APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ true,
5603           /*Extract*/ false, CostKind);
5604       ScalarCost +=
5605           VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind);
5606     }
5607 
5608     // Compute the scalarization overhead of needed extractelement
5609     // instructions. For each of the instruction's operands, if the operand can
5610     // be scalarized, add it to the worklist; otherwise, account for the
5611     // overhead.
5612     for (Use &U : I->operands())
5613       if (auto *J = dyn_cast<Instruction>(U.get())) {
5614         assert(VectorType::isValidElementType(J->getType()) &&
5615                "Instruction has non-scalar type");
5616         if (CanBeScalarized(J))
5617           Worklist.push_back(J);
5618         else if (needsExtract(J, VF)) {
5619           ScalarCost += TTI.getScalarizationOverhead(
5620               cast<VectorType>(toVectorTy(J->getType(), VF)),
5621               APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false,
5622               /*Extract*/ true, CostKind);
5623         }
5624       }
5625 
5626     // Scale the total scalar cost by block probability.
5627     ScalarCost /= getReciprocalPredBlockProb();
5628 
5629     // Compute the discount. A non-negative discount means the vector version
5630     // of the instruction costs more, and scalarizing would be beneficial.
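         // For example (illustrative): if an instruction's vector cost is 10
         // and its scaled scalar cost is 6, it contributes 10 - 6 = 4 to the
         // discount; a non-negative total means scalarizing the chain is
         // expected to be no more expensive than keeping it vectorized.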
5631     Discount += VectorCost - ScalarCost;
5632     ScalarCosts[I] = ScalarCost;
5633   }
5634 
5635   return Discount;
5636 }
5637 
5638 InstructionCost LoopVectorizationCostModel::expectedCost(ElementCount VF) {
5639   InstructionCost Cost;
5640 
5641   // If the vector loop gets executed exactly once with the given VF, ignore the
5642   // costs of comparison and induction instructions, as they'll get simplified
5643   // away.
5644   SmallPtrSet<Instruction *, 2> ValuesToIgnoreForVF;
5645   auto TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5646   if (VF.isFixed() && TC == VF.getFixedValue() && !foldTailByMasking())
5647     addFullyUnrolledInstructionsToIgnore(TheLoop, Legal->getInductionVars(),
5648                                          ValuesToIgnoreForVF);
5649 
5650   // For each block.
5651   for (BasicBlock *BB : TheLoop->blocks()) {
5652     InstructionCost BlockCost;
5653 
5654     // For each instruction in the old loop.
5655     for (Instruction &I : BB->instructionsWithoutDebug()) {
5656       // Skip ignored values.
5657       if (ValuesToIgnore.count(&I) || ValuesToIgnoreForVF.count(&I) ||
5658           (VF.isVector() && VecValuesToIgnore.count(&I)))
5659         continue;
5660 
5661       InstructionCost C = getInstructionCost(&I, VF);
5662 
5663       // Check if we should override the cost.
5664       if (C.isValid() && ForceTargetInstructionCost.getNumOccurrences() > 0)
5665         C = InstructionCost(ForceTargetInstructionCost);
5666 
5667       BlockCost += C;
5668       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF "
5669                         << VF << " For instruction: " << I << '\n');
5670     }
5671 
5672     // If we are vectorizing a predicated block, it will have been
5673     // if-converted. This means that the block's instructions (aside from
5674     // stores and instructions that may divide by zero) will now be
5675     // unconditionally executed. For the scalar case, we may not always execute
5676     // the predicated block, if it is an if-else block. Thus, scale the block's
5677     // cost by the probability of executing it. blockNeedsPredication from
5678     // Legal is used so as to not include all blocks in tail folded loops.
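         // For example, under the current assumption that a predicated block
         // executes roughly every other iteration, the block cost is halved.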
5679     if (VF.isScalar() && Legal->blockNeedsPredication(BB))
5680       BlockCost /= getReciprocalPredBlockProb();
5681 
5682     Cost += BlockCost;
5683   }
5684 
5685   return Cost;
5686 }
5687 
5688 /// Gets Address Access SCEV after verifying that the access pattern
5689 /// is loop invariant except the induction variable dependence.
5690 ///
5691 /// This SCEV can be sent to the Target in order to estimate the address
5692 /// calculation cost.
5693 static const SCEV *getAddressAccessSCEV(
5694               Value *Ptr,
5695               LoopVectorizationLegality *Legal,
5696               PredicatedScalarEvolution &PSE,
5697               const Loop *TheLoop) {
5698 
5699   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
5700   if (!Gep)
5701     return nullptr;
5702 
5703   // We are looking for a gep with all loop invariant indices except for one
5704   // which should be an induction variable.
5705   auto *SE = PSE.getSE();
5706   unsigned NumOperands = Gep->getNumOperands();
5707   for (unsigned Idx = 1; Idx < NumOperands; ++Idx) {
5708     Value *Opd = Gep->getOperand(Idx);
5709     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
5710         !Legal->isInductionVariable(Opd))
5711       return nullptr;
5712   }
5713 
5714   // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
5715   return PSE.getSCEV(Ptr);
5716 }
5717 
5718 InstructionCost
5719 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
5720                                                         ElementCount VF) {
5721   assert(VF.isVector() &&
5722          "Scalarization cost of instruction implies vectorization.");
5723   if (VF.isScalable())
5724     return InstructionCost::getInvalid();
5725 
5726   Type *ValTy = getLoadStoreType(I);
5727   auto *SE = PSE.getSE();
5728 
5729   unsigned AS = getLoadStoreAddressSpace(I);
5730   Value *Ptr = getLoadStorePointerOperand(I);
5731   Type *PtrTy = toVectorTy(Ptr->getType(), VF);
5732   // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
5733   //       that it is being called from this specific place.
5734 
5735   // Figure out whether the access is strided and get the stride value
5736   // if it's known at compile time.
5737   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
5738 
5739   // Get the cost of the scalar memory instruction and address computation.
5740   InstructionCost Cost =
5741       VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
5742 
5743   // Don't pass *I here, since it is scalar but will actually be part of a
5744   // vectorized loop where the user of it is a vectorized instruction.
5745   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
5746   const Align Alignment = getLoadStoreAlignment(I);
5747   Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(),
5748                                                       ValTy->getScalarType(),
5749                                                       Alignment, AS, CostKind);
5750 
5751   // Get the overhead of the extractelement and insertelement instructions
5752   // we might create due to scalarization.
5753   Cost += getScalarizationOverhead(I, VF, CostKind);
5754 
5755   // If we have a predicated load/store, it will need extra i1 extracts and
5756   // conditional branches, but may not be executed for each vector lane. Scale
5757   // the cost by the probability of executing the predicated block.
5758   if (isPredicatedInst(I)) {
5759     Cost /= getReciprocalPredBlockProb();
5760 
5761     // Add the cost of an i1 extract and a branch
5762     auto *VecI1Ty =
5763         VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
5764     Cost += TTI.getScalarizationOverhead(
5765         VecI1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
5766         /*Insert=*/false, /*Extract=*/true, CostKind);
5767     Cost += TTI.getCFInstrCost(Instruction::Br, CostKind);
5768 
5769     if (useEmulatedMaskMemRefHack(I, VF))
5770       // Artificially setting to a high enough value to practically disable
5771       // vectorization with such operations.
5772       Cost = 3000000;
5773   }
5774 
5775   return Cost;
5776 }
5777 
5778 InstructionCost
5779 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
5780                                                     ElementCount VF) {
5781   Type *ValTy = getLoadStoreType(I);
5782   auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5783   Value *Ptr = getLoadStorePointerOperand(I);
5784   unsigned AS = getLoadStoreAddressSpace(I);
5785   int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
5786   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
5787 
5788   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5789          "Stride should be 1 or -1 for consecutive memory access");
5790   const Align Alignment = getLoadStoreAlignment(I);
5791   InstructionCost Cost = 0;
5792   if (Legal->isMaskRequired(I)) {
5793     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
5794                                       CostKind);
5795   } else {
5796     TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
5797     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
5798                                 CostKind, OpInfo, I);
5799   }
5800 
5801   bool Reverse = ConsecutiveStride < 0;
5802   if (Reverse)
5803     Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, {},
5804                                CostKind, 0);
5805   return Cost;
5806 }
5807 
5808 InstructionCost
5809 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
5810                                                 ElementCount VF) {
5811   assert(Legal->isUniformMemOp(*I, VF));
5812 
5813   Type *ValTy = getLoadStoreType(I);
5814   auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5815   const Align Alignment = getLoadStoreAlignment(I);
5816   unsigned AS = getLoadStoreAddressSpace(I);
5817   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
5818   if (isa<LoadInst>(I)) {
5819     return TTI.getAddressComputationCost(ValTy) +
5820            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
5821                                CostKind) +
5822            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
5823   }
5824   StoreInst *SI = cast<StoreInst>(I);
5825 
5826   bool IsLoopInvariantStoreValue = Legal->isInvariant(SI->getValueOperand());
5827   return TTI.getAddressComputationCost(ValTy) +
5828          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
5829                              CostKind) +
5830          (IsLoopInvariantStoreValue
5831               ? 0
5832               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
5833                                        CostKind, VF.getKnownMinValue() - 1));
5834 }
5835 
5836 InstructionCost
5837 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
5838                                                  ElementCount VF) {
5839   Type *ValTy = getLoadStoreType(I);
5840   auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5841   const Align Alignment = getLoadStoreAlignment(I);
5842   const Value *Ptr = getLoadStorePointerOperand(I);
5843 
5844   return TTI.getAddressComputationCost(VectorTy) +
5845          TTI.getGatherScatterOpCost(
5846              I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
5847              TargetTransformInfo::TCK_RecipThroughput, I);
5848 }
5849 
5850 InstructionCost
5851 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
5852                                                    ElementCount VF) {
5853   const auto *Group = getInterleavedAccessGroup(I);
5854   assert(Group && "Failed to get an interleaved access group.");
5855 
5856   Instruction *InsertPos = Group->getInsertPos();
5857   Type *ValTy = getLoadStoreType(InsertPos);
5858   auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5859   unsigned AS = getLoadStoreAddressSpace(InsertPos);
5860   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
5861 
5862   unsigned InterleaveFactor = Group->getFactor();
5863   auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
5864 
5865   // Holds the indices of existing members in the interleaved group.
5866   SmallVector<unsigned, 4> Indices;
5867   for (unsigned IF = 0; IF < InterleaveFactor; IF++)
5868     if (Group->getMember(IF))
5869       Indices.push_back(IF);
5870 
5871   // Calculate the cost of the whole interleaved group.
5872   bool UseMaskForGaps =
5873       (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
5874       (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
5875   InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
5876       InsertPos->getOpcode(), WideVecTy, Group->getFactor(), Indices,
5877       Group->getAlign(), AS, CostKind, Legal->isMaskRequired(I),
5878       UseMaskForGaps);
5879 
5880   if (Group->isReverse()) {
5881     // TODO: Add support for reversed masked interleaved access.
5882     assert(!Legal->isMaskRequired(I) &&
5883            "Reverse masked interleaved access not supported.");
5884     Cost += Group->getNumMembers() *
5885             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, {},
5886                                CostKind, 0);
5887   }
5888   return Cost;
5889 }
5890 
5891 std::optional<InstructionCost>
5892 LoopVectorizationCostModel::getReductionPatternCost(
5893     Instruction *I, ElementCount VF, Type *Ty,
5894     TTI::TargetCostKind CostKind) const {
5895   using namespace llvm::PatternMatch;
5896   // Early exit if there are no in-loop reductions.
5897   if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Ty))
5898     return std::nullopt;
5899   auto *VectorTy = cast<VectorType>(Ty);
5900 
5901   // We look for one of the following patterns and choose the minimal acceptable cost:
5902   //  reduce(mul(ext(A), ext(B))) or
5903   //  reduce(mul(A, B)) or
5904   //  reduce(ext(A)) or
5905   //  reduce(A).
5906   // The basic idea is that we walk down the tree to do that, finding the root
5907   // reduction instruction in InLoopReductionImmediateChains. From there we find
5908   // the pattern of mul/ext and test the cost of the entire pattern vs the cost
5909   // of the components. If the reduction cost is lower, then we return it for
5910   // the reduction instruction and 0 for the other instructions in the pattern.
5911   // If it is not, we return an invalid cost, indicating that the original cost
5912   // method should be used.
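       // For example (illustrative): for reduce.add(mul(zext(A), zext(B))) with
       // i8 inputs accumulating into i32, the target may report a single
       // multiply-accumulate reduction cost (via getMulAccReductionCost below)
       // that is cheaper than the separate ext, mul and reduce costs; in that
       // case the reduction instruction is charged the combined cost and the
       // mul/ext instructions are charged 0.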
5913   Instruction *RetI = I;
5914   if (match(RetI, m_ZExtOrSExt(m_Value()))) {
5915     if (!RetI->hasOneUser())
5916       return std::nullopt;
5917     RetI = RetI->user_back();
5918   }
5919 
5920   if (match(RetI, m_OneUse(m_Mul(m_Value(), m_Value()))) &&
5921       RetI->user_back()->getOpcode() == Instruction::Add) {
5922     RetI = RetI->user_back();
5923   }
5924 
5925   // Test if the found instruction is a reduction; if not, bail out so that
5926   // the caller falls back to the original cost modelling.
5927   if (!InLoopReductionImmediateChains.count(RetI))
5928     return std::nullopt;
5929 
5930   // Find the reduction this chain is a part of and calculate the basic cost of
5931   // the reduction on its own.
5932   Instruction *LastChain = InLoopReductionImmediateChains.at(RetI);
5933   Instruction *ReductionPhi = LastChain;
5934   while (!isa<PHINode>(ReductionPhi))
5935     ReductionPhi = InLoopReductionImmediateChains.at(ReductionPhi);
5936 
5937   const RecurrenceDescriptor &RdxDesc =
5938       Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second;
5939 
5940   InstructionCost BaseCost;
5941   RecurKind RK = RdxDesc.getRecurrenceKind();
5942   if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) {
5943     Intrinsic::ID MinMaxID = getMinMaxReductionIntrinsicOp(RK);
5944     BaseCost = TTI.getMinMaxReductionCost(MinMaxID, VectorTy,
5945                                           RdxDesc.getFastMathFlags(), CostKind);
5946   } else {
5947     BaseCost = TTI.getArithmeticReductionCost(
5948         RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
5949   }
5950 
5951   // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
5952   // normal fmul instruction to the cost of the fadd reduction.
5953   if (RK == RecurKind::FMulAdd)
5954     BaseCost +=
5955         TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);
5956 
5957   // If we're using ordered reductions then we can just return the base cost
5958   // here, since getArithmeticReductionCost calculates the full ordered
5959   // reduction cost when FP reassociation is not allowed.
5960   if (useOrderedReductions(RdxDesc))
5961     return BaseCost;
5962 
5963   // Get the operand that was not the reduction chain and match it to one of the
5964   // patterns, returning the better cost if it is found.
5965   Instruction *RedOp = RetI->getOperand(1) == LastChain
5966                            ? dyn_cast<Instruction>(RetI->getOperand(0))
5967                            : dyn_cast<Instruction>(RetI->getOperand(1));
5968 
5969   VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
5970 
5971   Instruction *Op0, *Op1;
5972   if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
5973       match(RedOp,
5974             m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
5975       match(Op0, m_ZExtOrSExt(m_Value())) &&
5976       Op0->getOpcode() == Op1->getOpcode() &&
5977       Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
5978       !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) &&
5979       (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
5980 
5981     // Matched reduce.add(ext(mul(ext(A), ext(B))))
5982     // Note that the extend opcodes need to all match, or if A==B they will have
5983     // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
5984     // which is equally fine.
5985     bool IsUnsigned = isa<ZExtInst>(Op0);
5986     auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
5987     auto *MulType = VectorType::get(Op0->getType(), VectorTy);
5988 
5989     InstructionCost ExtCost =
5990         TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
5991                              TTI::CastContextHint::None, CostKind, Op0);
5992     InstructionCost MulCost =
5993         TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
5994     InstructionCost Ext2Cost =
5995         TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType,
5996                              TTI::CastContextHint::None, CostKind, RedOp);
5997 
5998     InstructionCost RedCost = TTI.getMulAccReductionCost(
5999         IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
6000 
6001     if (RedCost.isValid() &&
6002         RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
6003       return I == RetI ? RedCost : 0;
6004   } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
6005              !TheLoop->isLoopInvariant(RedOp)) {
6006     // Matched reduce(ext(A))
6007     bool IsUnsigned = isa<ZExtInst>(RedOp);
6008     auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
6009     InstructionCost RedCost = TTI.getExtendedReductionCost(
6010         RdxDesc.getOpcode(), IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
6011         RdxDesc.getFastMathFlags(), CostKind);
6012 
6013     InstructionCost ExtCost =
6014         TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
6015                              TTI::CastContextHint::None, CostKind, RedOp);
6016     if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
6017       return I == RetI ? RedCost : 0;
6018   } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
6019              match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
6020     if (match(Op0, m_ZExtOrSExt(m_Value())) &&
6021         Op0->getOpcode() == Op1->getOpcode() &&
6022         !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) {
6023       bool IsUnsigned = isa<ZExtInst>(Op0);
6024       Type *Op0Ty = Op0->getOperand(0)->getType();
6025       Type *Op1Ty = Op1->getOperand(0)->getType();
6026       Type *LargestOpTy =
6027           Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
6028                                                                     : Op0Ty;
6029       auto *ExtType = VectorType::get(LargestOpTy, VectorTy);
6030 
6031       // Matched reduce.add(mul(ext(A), ext(B))), where the two exts may have
6032       // different sizes. We take the largest type as the extension to reduce, and
6033       // add the remaining cost as, for example, reduce(mul(ext(ext(A)), ext(B))).
6034       InstructionCost ExtCost0 = TTI.getCastInstrCost(
6035           Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy),
6036           TTI::CastContextHint::None, CostKind, Op0);
6037       InstructionCost ExtCost1 = TTI.getCastInstrCost(
6038           Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy),
6039           TTI::CastContextHint::None, CostKind, Op1);
6040       InstructionCost MulCost =
6041           TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6042 
6043       InstructionCost RedCost = TTI.getMulAccReductionCost(
6044           IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
6045       InstructionCost ExtraExtCost = 0;
6046       if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
6047         Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
6048         ExtraExtCost = TTI.getCastInstrCost(
6049             ExtraExtOp->getOpcode(), ExtType,
6050             VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy),
6051             TTI::CastContextHint::None, CostKind, ExtraExtOp);
6052       }
6053 
6054       if (RedCost.isValid() &&
6055           (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
6056         return I == RetI ? RedCost : 0;
6057     } else if (!match(I, m_ZExtOrSExt(m_Value()))) {
6058       // Matched reduce.add(mul())
6059       InstructionCost MulCost =
6060           TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6061 
6062       InstructionCost RedCost = TTI.getMulAccReductionCost(
6063           true, RdxDesc.getRecurrenceType(), VectorTy, CostKind);
6064 
6065       if (RedCost.isValid() && RedCost < MulCost + BaseCost)
6066         return I == RetI ? RedCost : 0;
6067     }
6068   }
6069 
6070   return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt;
6071 }
6072 
6073 InstructionCost
6074 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
6075                                                      ElementCount VF) {
6076   // Calculate the scalar cost only. The vectorization cost should already have
6077   // been computed at this point.
6078   if (VF.isScalar()) {
6079     Type *ValTy = getLoadStoreType(I);
6080     const Align Alignment = getLoadStoreAlignment(I);
6081     unsigned AS = getLoadStoreAddressSpace(I);
6082 
6083     TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
6084     return TTI.getAddressComputationCost(ValTy) +
6085            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
6086                                TTI::TCK_RecipThroughput, OpInfo, I);
6087   }
6088   return getWideningCost(I, VF);
6089 }
6090 
6091 InstructionCost LoopVectorizationCostModel::getScalarizationOverhead(
6092     Instruction *I, ElementCount VF, TTI::TargetCostKind CostKind) const {
6093 
6094   // There is no mechanism yet to create a scalable scalarization loop,
6095   // so this is currently Invalid.
6096   if (VF.isScalable())
6097     return InstructionCost::getInvalid();
6098 
6099   if (VF.isScalar())
6100     return 0;
6101 
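       // The overhead is the cost of inserting the scalar results back into a
       // vector plus the cost of extracting the operands, except where the
       // target keeps addresses scalar or supports efficient element
       // loads/stores (handled below).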
6102   InstructionCost Cost = 0;
6103   Type *RetTy = toVectorTy(I->getType(), VF);
6104   if (!RetTy->isVoidTy() &&
6105       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
6106     Cost += TTI.getScalarizationOverhead(
6107         cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()),
6108         /*Insert*/ true,
6109         /*Extract*/ false, CostKind);
6110 
6111   // Some targets keep addresses scalar.
6112   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
6113     return Cost;
6114 
6115   // Some targets support efficient element stores.
6116   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
6117     return Cost;
6118 
6119   // Collect operands to consider.
6120   CallInst *CI = dyn_cast<CallInst>(I);
6121   Instruction::op_range Ops = CI ? CI->args() : I->operands();
6122 
6123   // Skip operands that do not require extraction/scalarization and do not incur
6124   // any overhead.
6125   SmallVector<Type *> Tys;
6126   for (auto *V : filterExtractingOperands(Ops, VF))
6127     Tys.push_back(maybeVectorizeType(V->getType(), VF));
6128   return Cost + TTI.getOperandsScalarizationOverhead(
6129                     filterExtractingOperands(Ops, VF), Tys, CostKind);
6130 }
6131 
6132 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
6133   if (VF.isScalar())
6134     return;
6135   NumPredStores = 0;
6136   for (BasicBlock *BB : TheLoop->blocks()) {
6137     // For each instruction in the old loop.
6138     for (Instruction &I : *BB) {
6139       Value *Ptr = getLoadStorePointerOperand(&I);
6140       if (!Ptr)
6141         continue;
6142 
6143       // TODO: We should generate better code and update the cost model for
6144       // predicated uniform stores. Today they are treated as any other
6145       // predicated store (see added test cases in
6146       // invariant-store-vectorization.ll).
6147       if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF))
6148         NumPredStores++;
6149 
6150       if (Legal->isUniformMemOp(I, VF)) {
6151         auto IsLegalToScalarize = [&]() {
6152           if (!VF.isScalable())
6153             // Scalarization of fixed length vectors "just works".
6154             return true;
6155 
6156           // We have dedicated lowering for unpredicated uniform loads and
6157           // stores.  Note that even with tail folding we know that at least
6158           // one lane is active (i.e. generalized predication is not possible
6159           // here), and the logic below depends on this fact.
6160           if (!foldTailByMasking())
6161             return true;
6162 
6163           // For scalable vectors, a uniform memop load is always
6164           // uniform-by-parts and we know how to scalarize that.
6165           if (isa<LoadInst>(I))
6166             return true;
6167 
6168           // A uniform store isn't necessarily uniform-by-parts
6169           // and we can't assume scalarization.
6170           auto &SI = cast<StoreInst>(I);
6171           return TheLoop->isLoopInvariant(SI.getValueOperand());
6172         };
6173 
6174         const InstructionCost GatherScatterCost =
6175           isLegalGatherOrScatter(&I, VF) ?
6176           getGatherScatterCost(&I, VF) : InstructionCost::getInvalid();
6177 
6178         // Load: Scalar load + broadcast
6179         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6180         // FIXME: This cost is a significant under-estimate for tail folded
6181         // memory ops.
6182         const InstructionCost ScalarizationCost =
6183             IsLegalToScalarize() ? getUniformMemOpCost(&I, VF)
6184                                  : InstructionCost::getInvalid();
6185 
6186         // Choose the better solution for the current VF. Note that Invalid
6187         // costs compare as maximally large. If both are invalid, we record an
6188         // invalid cost, which signals a failure and a vectorization abort.
6189         if (GatherScatterCost < ScalarizationCost)
6190           setWideningDecision(&I, VF, CM_GatherScatter, GatherScatterCost);
6191         else
6192           setWideningDecision(&I, VF, CM_Scalarize, ScalarizationCost);
6193         continue;
6194       }
6195 
6196       // We assume that widening is the best solution when possible.
6197       if (memoryInstructionCanBeWidened(&I, VF)) {
6198         InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
6199         int ConsecutiveStride = Legal->isConsecutivePtr(
6200             getLoadStoreType(&I), getLoadStorePointerOperand(&I));
6201         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6202                "Expected consecutive stride.");
6203         InstWidening Decision =
6204             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6205         setWideningDecision(&I, VF, Decision, Cost);
6206         continue;
6207       }
6208 
6209       // Choose between Interleaving, Gather/Scatter or Scalarization.
6210       InstructionCost InterleaveCost = InstructionCost::getInvalid();
6211       unsigned NumAccesses = 1;
6212       if (isAccessInterleaved(&I)) {
6213         const auto *Group = getInterleavedAccessGroup(&I);
6214         assert(Group && "Fail to get an interleaved access group.");
6215 
6216         // Make one decision for the whole group.
6217         if (getWideningDecision(&I, VF) != CM_Unknown)
6218           continue;
6219 
6220         NumAccesses = Group->getNumMembers();
6221         if (interleavedAccessCanBeWidened(&I, VF))
6222           InterleaveCost = getInterleaveGroupCost(&I, VF);
6223       }
6224 
6225       InstructionCost GatherScatterCost =
6226           isLegalGatherOrScatter(&I, VF)
6227               ? getGatherScatterCost(&I, VF) * NumAccesses
6228               : InstructionCost::getInvalid();
6229 
6230       InstructionCost ScalarizationCost =
6231           getMemInstScalarizationCost(&I, VF) * NumAccesses;
6232 
6233       // Choose the better solution for the current VF, record the decision,
6234       // and use it during vectorization.
6235       InstructionCost Cost;
6236       InstWidening Decision;
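           // Interleaving wins a tie against gather/scatter, while
           // scalarization wins a tie against either of the other two options.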
6237       if (InterleaveCost <= GatherScatterCost &&
6238           InterleaveCost < ScalarizationCost) {
6239         Decision = CM_Interleave;
6240         Cost = InterleaveCost;
6241       } else if (GatherScatterCost < ScalarizationCost) {
6242         Decision = CM_GatherScatter;
6243         Cost = GatherScatterCost;
6244       } else {
6245         Decision = CM_Scalarize;
6246         Cost = ScalarizationCost;
6247       }
6248       // If the instruction belongs to an interleave group, the whole group
6249       // receives the same decision. The cost is recorded for the group as a
6250       // whole, but will actually be assigned to a single member instruction.
6251       if (const auto *Group = getInterleavedAccessGroup(&I))
6252         setWideningDecision(Group, VF, Decision, Cost);
6253       else
6254         setWideningDecision(&I, VF, Decision, Cost);
6255     }
6256   }
6257 
6258   // Make sure that any load of an address and any other address computation
6259   // remains scalar unless there is gather/scatter support. This avoids
6260   // inevitable extracts into address registers, and also has the benefit of
6261   // activating LSR more, since that pass can't optimize vectorized
6262   // addresses.
6263   if (TTI.prefersVectorizedAddressing())
6264     return;
6265 
6266   // Start with all scalar pointer uses.
6267   SmallPtrSet<Instruction *, 8> AddrDefs;
6268   for (BasicBlock *BB : TheLoop->blocks())
6269     for (Instruction &I : *BB) {
6270       Instruction *PtrDef =
6271         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6272       if (PtrDef && TheLoop->contains(PtrDef) &&
6273           getWideningDecision(&I, VF) != CM_GatherScatter)
6274         AddrDefs.insert(PtrDef);
6275     }
6276 
6277   // Add all instructions used to generate the addresses.
6278   SmallVector<Instruction *, 4> Worklist;
6279   append_range(Worklist, AddrDefs);
6280   while (!Worklist.empty()) {
6281     Instruction *I = Worklist.pop_back_val();
6282     for (auto &Op : I->operands())
6283       if (auto *InstOp = dyn_cast<Instruction>(Op))
6284         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6285             AddrDefs.insert(InstOp).second)
6286           Worklist.push_back(InstOp);
6287   }
6288 
6289   for (auto *I : AddrDefs) {
6290     if (isa<LoadInst>(I)) {
6291       // Setting the desired widening decision should ideally be handled by
6292       // the cost functions, but since this involves finding out whether the
6293       // loaded register is involved in an address computation, it is instead
6294       // changed here when we know this is the case.
6295       InstWidening Decision = getWideningDecision(I, VF);
6296       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6297         // Scalarize a widened load of address.
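             // The cost is approximated as VF copies of the scalar
             // memory-instruction cost, one per lane.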
6298         setWideningDecision(
6299             I, VF, CM_Scalarize,
6300             (VF.getKnownMinValue() *
6301              getMemoryInstructionCost(I, ElementCount::getFixed(1))));
6302       else if (const auto *Group = getInterleavedAccessGroup(I)) {
6303         // Scalarize an interleave group of address loads.
6304         for (unsigned I = 0; I < Group->getFactor(); ++I) {
6305           if (Instruction *Member = Group->getMember(I))
6306             setWideningDecision(
6307                 Member, VF, CM_Scalarize,
6308                 (VF.getKnownMinValue() *
6309                  getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
6310         }
6311       }
6312     } else
6313       // Make sure I gets scalarized and receives a cost estimate without
6314       // scalarization overhead.
6315       ForcedScalars[VF].insert(I);
6316   }
6317 }
6318 
6319 void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
6320   assert(!VF.isScalar() &&
6321          "Trying to set a vectorization decision for a scalar VF");
6322 
6323   auto ForcedScalar = ForcedScalars.find(VF);
6324   for (BasicBlock *BB : TheLoop->blocks()) {
6325     // For each instruction in the old loop.
6326     for (Instruction &I : *BB) {
6327       CallInst *CI = dyn_cast<CallInst>(&I);
6328 
6329       if (!CI)
6330         continue;
6331 
6332       InstructionCost ScalarCost = InstructionCost::getInvalid();
6333       InstructionCost VectorCost = InstructionCost::getInvalid();
6334       InstructionCost IntrinsicCost = InstructionCost::getInvalid();
6335       TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6336       Function *ScalarFunc = CI->getCalledFunction();
6337       Type *ScalarRetTy = CI->getType();
6338       SmallVector<Type *, 4> Tys, ScalarTys;
6339       for (auto &ArgOp : CI->args())
6340         ScalarTys.push_back(ArgOp->getType());
6341 
6342       // Estimate the cost of a scalarized vector call. The source operands are
6343       // assumed to be vectors, so we need to extract individual elements from
6344       // them, execute VF scalar calls, and then gather the results into the
6345       // vector return value.
6346       InstructionCost ScalarCallCost =
6347           TTI.getCallInstrCost(ScalarFunc, ScalarRetTy, ScalarTys, CostKind);
6348 
6349       // Compute costs of unpacking argument values for the scalar calls and
6350       // packing the return values to a vector.
6351       InstructionCost ScalarizationCost =
6352           getScalarizationOverhead(CI, VF, CostKind);
6353 
6354       ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
6355       // Honor ForcedScalars and UniformAfterVectorization decisions.
6356       // TODO: For calls, it might still be more profitable to widen. Use
6357       // VPlan-based cost model to compare different options.
6358       if (VF.isVector() && ((ForcedScalar != ForcedScalars.end() &&
6359                              ForcedScalar->second.contains(CI)) ||
6360                             isUniformAfterVectorization(CI, VF))) {
6361         setCallWideningDecision(CI, VF, CM_Scalarize, nullptr,
6362                                 Intrinsic::not_intrinsic, std::nullopt,
6363                                 ScalarCost);
6364         continue;
6365       }
6366 
6367       bool MaskRequired = Legal->isMaskRequired(CI);
6368       // Compute corresponding vector type for return value and arguments.
6369       Type *RetTy = toVectorTy(ScalarRetTy, VF);
6370       for (Type *ScalarTy : ScalarTys)
6371         Tys.push_back(toVectorTy(ScalarTy, VF));
6372 
6373       // An in-loop reduction using an fmuladd intrinsic is a special case;
6374       // we don't want the normal cost for that intrinsic.
6375       if (RecurrenceDescriptor::isFMulAddIntrinsic(CI))
6376         if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind)) {
6377           setCallWideningDecision(CI, VF, CM_IntrinsicCall, nullptr,
6378                                   getVectorIntrinsicIDForCall(CI, TLI),
6379                                   std::nullopt, *RedCost);
6380           continue;
6381         }
6382 
6383       // Find the cost of vectorizing the call, if we can find a suitable
6384       // vector variant of the function.
6385       bool UsesMask = false;
6386       VFInfo FuncInfo;
6387       Function *VecFunc = nullptr;
6388       // Search through any available variants for one we can use at this VF.
6389       for (VFInfo &Info : VFDatabase::getMappings(*CI)) {
6390         // Must match requested VF.
6391         if (Info.Shape.VF != VF)
6392           continue;
6393 
6394         // Must take a mask argument if one is required
6395         if (MaskRequired && !Info.isMasked())
6396           continue;
6397 
6398         // Check that all parameter kinds are supported
6399         bool ParamsOk = true;
6400         for (VFParameter Param : Info.Shape.Parameters) {
6401           switch (Param.ParamKind) {
6402           case VFParamKind::Vector:
6403             break;
6404           case VFParamKind::OMP_Uniform: {
6405             Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
6406             // Make sure the scalar parameter is invariant in the loop.
6407             if (!PSE.getSE()->isLoopInvariant(PSE.getSCEV(ScalarParam),
6408                                               TheLoop))
6409               ParamsOk = false;
6410             break;
6411           }
6412           case VFParamKind::OMP_Linear: {
6413             Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
6414             // Find the stride for the scalar parameter in this loop and see if
6415             // it matches the stride for the variant.
6416             // TODO: do we need to figure out the cost of an extract to get the
6417             // first lane? Or do we hope that it will be folded away?
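                 // For example, a variant declared with linear(p:4) is only
                 // usable here if the corresponding argument is an add
                 // recurrence of this loop with a constant step of 4.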
6418             ScalarEvolution *SE = PSE.getSE();
6419             const auto *SAR =
6420                 dyn_cast<SCEVAddRecExpr>(SE->getSCEV(ScalarParam));
6421 
6422             if (!SAR || SAR->getLoop() != TheLoop) {
6423               ParamsOk = false;
6424               break;
6425             }
6426 
6427             const SCEVConstant *Step =
6428                 dyn_cast<SCEVConstant>(SAR->getStepRecurrence(*SE));
6429 
6430             if (!Step ||
6431                 Step->getAPInt().getSExtValue() != Param.LinearStepOrPos)
6432               ParamsOk = false;
6433 
6434             break;
6435           }
6436           case VFParamKind::GlobalPredicate:
6437             UsesMask = true;
6438             break;
6439           default:
6440             ParamsOk = false;
6441             break;
6442           }
6443         }
6444 
6445         if (!ParamsOk)
6446           continue;
6447 
6448         // Found a suitable candidate, stop here.
6449         VecFunc = CI->getModule()->getFunction(Info.VectorName);
6450         FuncInfo = Info;
6451         break;
6452       }
6453 
6454       // Add in the cost of synthesizing a mask if one wasn't required.
6455       InstructionCost MaskCost = 0;
6456       if (VecFunc && UsesMask && !MaskRequired)
6457         MaskCost = TTI.getShuffleCost(
6458             TargetTransformInfo::SK_Broadcast,
6459             VectorType::get(IntegerType::getInt1Ty(
6460                                 VecFunc->getFunctionType()->getContext()),
6461                             VF));
6462 
6463       if (TLI && VecFunc && !CI->isNoBuiltin())
6464         VectorCost =
6465             TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind) + MaskCost;
6466 
6467       // Find the cost of an intrinsic; some targets may have instructions that
6468       // perform the operation without needing an actual call.
6469       Intrinsic::ID IID = getVectorIntrinsicIDForCall(CI, TLI);
6470       if (IID != Intrinsic::not_intrinsic)
6471         IntrinsicCost = getVectorIntrinsicCost(CI, VF);
6472 
6473       InstructionCost Cost = ScalarCost;
6474       InstWidening Decision = CM_Scalarize;
6475 
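           // Ties are broken in favor of the vector call over scalarization,
           // and of the intrinsic over either of the other two options.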
6476       if (VectorCost <= Cost) {
6477         Cost = VectorCost;
6478         Decision = CM_VectorCall;
6479       }
6480 
6481       if (IntrinsicCost <= Cost) {
6482         Cost = IntrinsicCost;
6483         Decision = CM_IntrinsicCall;
6484       }
6485 
6486       setCallWideningDecision(CI, VF, Decision, VecFunc, IID,
6487                               FuncInfo.getParamIndexForOptionalMask(), Cost);
6488     }
6489   }
6490 }
6491 
6492 bool LoopVectorizationCostModel::shouldConsiderInvariant(Value *Op) {
6493   if (!Legal->isInvariant(Op))
6494     return false;
6495   // Consider Op invariant only if neither it nor its operands are predicated
6496   // instructions in the loop; a predicated instruction isn't trivially hoistable.
6497   auto *OpI = dyn_cast<Instruction>(Op);
6498   return !OpI || !TheLoop->contains(OpI) ||
6499          (!isPredicatedInst(OpI) &&
6500           (!isa<PHINode>(OpI) || OpI->getParent() != TheLoop->getHeader()) &&
6501           all_of(OpI->operands(),
6502                  [this](Value *Op) { return shouldConsiderInvariant(Op); }));
6503 }
6504 
6505 InstructionCost
6506 LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6507                                                ElementCount VF) {
6508   // If we know that this instruction will remain uniform, check the cost of
6509   // the scalar version.
6510   if (isUniformAfterVectorization(I, VF))
6511     VF = ElementCount::getFixed(1);
6512 
6513   if (VF.isVector() && isProfitableToScalarize(I, VF))
6514     return InstsToScalarize[VF][I];
6515 
6516   // Forced scalars do not have any scalarization overhead.
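       // Their cost is modeled as VF independent copies of the scalar cost.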
6517   auto ForcedScalar = ForcedScalars.find(VF);
6518   if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
6519     auto InstSet = ForcedScalar->second;
6520     if (InstSet.count(I))
6521       return getInstructionCost(I, ElementCount::getFixed(1)) *
6522              VF.getKnownMinValue();
6523   }
6524 
6525   Type *RetTy = I->getType();
6526   if (canTruncateToMinimalBitwidth(I, VF))
6527     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6528   auto *SE = PSE.getSE();
6529   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6530 
6531   auto HasSingleCopyAfterVectorization = [this](Instruction *I,
6532                                                 ElementCount VF) -> bool {
6533     if (VF.isScalar())
6534       return true;
6535 
6536     auto Scalarized = InstsToScalarize.find(VF);
6537     assert(Scalarized != InstsToScalarize.end() &&
6538            "VF not yet analyzed for scalarization profitability");
6539     return !Scalarized->second.count(I) &&
6540            llvm::all_of(I->users(), [&](User *U) {
6541              auto *UI = cast<Instruction>(U);
6542              return !Scalarized->second.count(UI);
6543            });
6544   };
6545   (void)HasSingleCopyAfterVectorization;
6546 
6547   Type *VectorTy;
6548   if (isScalarAfterVectorization(I, VF)) {
6549     // With the exception of GEPs and PHIs, after scalarization there should
6550     // only be one copy of the instruction generated in the loop. This is
6551     // because the VF is either 1, or any instructions that need scalarizing
6552     // have already been dealt with by the time we get here. As a result, we
6553     // don't have to multiply the instruction cost by VF.
6554     assert(I->getOpcode() == Instruction::GetElementPtr ||
6555            I->getOpcode() == Instruction::PHI ||
6556            (I->getOpcode() == Instruction::BitCast &&
6557             I->getType()->isPointerTy()) ||
6558            HasSingleCopyAfterVectorization(I, VF));
6559     VectorTy = RetTy;
6560   } else
6561     VectorTy = toVectorTy(RetTy, VF);
6562 
6563   if (VF.isVector() && VectorTy->isVectorTy() &&
6564       !TTI.getNumberOfParts(VectorTy))
6565     return InstructionCost::getInvalid();
6566 
6567   // TODO: We need to estimate the cost of intrinsic calls.
6568   switch (I->getOpcode()) {
6569   case Instruction::GetElementPtr:
6570     // We mark this instruction as zero-cost because the cost of GEPs in
6571     // vectorized code depends on whether the corresponding memory instruction
6572     // is scalarized or not. Therefore, we handle GEPs with the memory
6573     // instruction cost.
6574     return 0;
6575   case Instruction::Br: {
6576     // In cases of scalarized and predicated instructions, there will be VF
6577     // predicated blocks in the vectorized loop. Each branch around these
6578     // blocks also requires an extract of its vector compare i1 element.
6579     // Note that the conditional branch from the loop latch will be replaced by
6580     // a single branch controlling the loop, so there is no extra overhead from
6581     // scalarization.
6582     bool ScalarPredicatedBB = false;
6583     BranchInst *BI = cast<BranchInst>(I);
6584     if (VF.isVector() && BI->isConditional() &&
6585         (PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) ||
6586          PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))) &&
6587         BI->getParent() != TheLoop->getLoopLatch())
6588       ScalarPredicatedBB = true;
6589 
6590     if (ScalarPredicatedBB) {
6591       // It is not possible to scalarize a scalable vector with predicated instructions.
6592       if (VF.isScalable())
6593         return InstructionCost::getInvalid();
6594       // Return cost for branches around scalarized and predicated blocks.
6595       auto *VecI1Ty =
6596           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6597       return (
6598           TTI.getScalarizationOverhead(
6599               VecI1Ty, APInt::getAllOnes(VF.getFixedValue()),
6600               /*Insert*/ false, /*Extract*/ true, CostKind) +
6601           (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
6602     }
6603 
6604     if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
6605       // The back-edge branch will remain, as will all scalar branches.
6606       return TTI.getCFInstrCost(Instruction::Br, CostKind);
6607 
6608     // This branch will be eliminated by if-conversion.
6609     return 0;
6610     // Note: We currently assume zero cost for an unconditional branch inside
6611     // a predicated block since it will become a fall-through, although we
6612     // may decide in the future to call TTI for all branches.
6613   }
6614   case Instruction::Switch: {
6615     if (VF.isScalar())
6616       return TTI.getCFInstrCost(Instruction::Switch, CostKind);
6617     auto *Switch = cast<SwitchInst>(I);
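         // The vector cost is modeled as one vector integer compare against
         // the case value per case; the default destination adds no compare.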
6618     return Switch->getNumCases() *
6619            TTI.getCmpSelInstrCost(
6620                Instruction::ICmp,
6621                toVectorTy(Switch->getCondition()->getType(), VF),
6622                toVectorTy(Type::getInt1Ty(I->getContext()), VF),
6623                CmpInst::ICMP_EQ, CostKind);
6624   }
6625   case Instruction::PHI: {
6626     auto *Phi = cast<PHINode>(I);
6627 
6628     // First-order recurrences are replaced by vector shuffles inside the loop.
6629     if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) {
6630       // For <vscale x 1 x i64>, if vscale = 1 we are unable to extract the
6631       // penultimate value of the recurrence.
6632       // TODO: Consider vscale_range info.
6633       if (VF.isScalable() && VF.getKnownMinValue() == 1)
6634         return InstructionCost::getInvalid();
6635       SmallVector<int> Mask(VF.getKnownMinValue());
6636       std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
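           // E.g. for VF=4 this builds the splice mask <3, 4, 5, 6>: the last
           // element of the previous vector value followed by the first VF-1
           // elements of the current one.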
6637       return TTI.getShuffleCost(TargetTransformInfo::SK_Splice,
6638                                 cast<VectorType>(VectorTy), Mask, CostKind,
6639                                 VF.getKnownMinValue() - 1);
6640     }
6641 
6642     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6643     // converted into select instructions. We require N - 1 selects per phi
6644     // node, where N is the number of incoming values.
6645     if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) {
6646       Type *ResultTy = Phi->getType();
6647 
6648       // All instructions in an Any-of reduction chain are narrowed to bool.
6649       // Check if that is the case for this phi node.
6650       auto *HeaderUser = cast_if_present<PHINode>(
6651           find_singleton<User>(Phi->users(), [this](User *U, bool) -> User * {
6652             auto *Phi = dyn_cast<PHINode>(U);
6653             if (Phi && Phi->getParent() == TheLoop->getHeader())
6654               return Phi;
6655             return nullptr;
6656           }));
6657       if (HeaderUser) {
6658         auto &ReductionVars = Legal->getReductionVars();
6659         auto Iter = ReductionVars.find(HeaderUser);
6660         if (Iter != ReductionVars.end() &&
6661             RecurrenceDescriptor::isAnyOfRecurrenceKind(
6662                 Iter->second.getRecurrenceKind()))
6663           ResultTy = Type::getInt1Ty(Phi->getContext());
6664       }
6665       return (Phi->getNumIncomingValues() - 1) *
6666              TTI.getCmpSelInstrCost(
6667                  Instruction::Select, toVectorTy(ResultTy, VF),
6668                  toVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
6669                  CmpInst::BAD_ICMP_PREDICATE, CostKind);
6670     }
6671 
6672     // When tail folding with EVL, if the phi is part of an out-of-loop
6673     // reduction then it will be transformed into a wide vp_merge.
6674     if (VF.isVector() && foldTailWithEVL() &&
6675         Legal->getReductionVars().contains(Phi) && !isInLoopReduction(Phi)) {
6676       IntrinsicCostAttributes ICA(
6677           Intrinsic::vp_merge, toVectorTy(Phi->getType(), VF),
6678           {toVectorTy(Type::getInt1Ty(Phi->getContext()), VF)});
6679       return TTI.getIntrinsicInstrCost(ICA, CostKind);
6680     }
6681 
6682     return TTI.getCFInstrCost(Instruction::PHI, CostKind);
6683   }
6684   case Instruction::UDiv:
6685   case Instruction::SDiv:
6686   case Instruction::URem:
6687   case Instruction::SRem:
6688     if (VF.isVector() && isPredicatedInst(I)) {
6689       const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
6690       return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ?
6691         ScalarCost : SafeDivisorCost;
6692     }
6693     // We've proven all lanes safe to speculate, fall through.
6694     [[fallthrough]];
6695   case Instruction::Add:
6696   case Instruction::Sub: {
6697     auto Info = Legal->getHistogramInfo(I);
6698     if (Info && VF.isVector()) {
6699       const HistogramInfo *HGram = Info.value();
6700       // Assume that a non-constant update value (or a constant != 1) requires
6701       // a multiply, and add that into the cost.
6702       InstructionCost MulCost = TTI::TCC_Free;
6703       ConstantInt *RHS = dyn_cast<ConstantInt>(I->getOperand(1));
6704       if (!RHS || RHS->getZExtValue() != 1)
6705         MulCost = TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy);
6706 
6707       // Find the cost of the histogram operation itself.
6708       Type *PtrTy = VectorType::get(HGram->Load->getPointerOperandType(), VF);
6709       Type *ScalarTy = I->getType();
6710       Type *MaskTy = VectorType::get(Type::getInt1Ty(I->getContext()), VF);
6711       IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add,
6712                                   Type::getVoidTy(I->getContext()),
6713                                   {PtrTy, ScalarTy, MaskTy});
6714 
6715       // Add the costs together with the add/sub operation.
6716       return TTI.getIntrinsicInstrCost(
6717                  ICA, TargetTransformInfo::TCK_RecipThroughput) +
6718              MulCost + TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy);
6719     }
6720     [[fallthrough]];
6721   }
6722   case Instruction::FAdd:
6723   case Instruction::FSub:
6724   case Instruction::Mul:
6725   case Instruction::FMul:
6726   case Instruction::FDiv:
6727   case Instruction::FRem:
6728   case Instruction::Shl:
6729   case Instruction::LShr:
6730   case Instruction::AShr:
6731   case Instruction::And:
6732   case Instruction::Or:
6733   case Instruction::Xor: {
6734     // If we're speculating on the stride being 1, the multiplication may
6735     // fold away.  We can generalize this for all operations using the notion
6736     // of neutral elements.  (TODO)
6737     if (I->getOpcode() == Instruction::Mul &&
6738         (PSE.getSCEV(I->getOperand(0))->isOne() ||
6739          PSE.getSCEV(I->getOperand(1))->isOne()))
6740       return 0;
6741 
6742     // Detect reduction patterns
6743     if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
6744       return *RedCost;
6745 
6746     // Certain instructions can be cheaper to vectorize if they have a constant
6747     // second vector operand. One example of this is shifts on x86.
6748     Value *Op2 = I->getOperand(1);
6749     if (!isa<Constant>(Op2) && PSE.getSE()->isSCEVable(Op2->getType()) &&
6750         isa<SCEVConstant>(PSE.getSCEV(Op2))) {
6751       Op2 = cast<SCEVConstant>(PSE.getSCEV(Op2))->getValue();
6752     }
6753     auto Op2Info = TTI.getOperandInfo(Op2);
6754     if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
6755         shouldConsiderInvariant(Op2))
6756       Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
6757 
6758     SmallVector<const Value *, 4> Operands(I->operand_values());
6759     return TTI.getArithmeticInstrCost(
6760         I->getOpcode(), VectorTy, CostKind,
6761         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6762         Op2Info, Operands, I, TLI);
6763   }
6764   case Instruction::FNeg: {
6765     return TTI.getArithmeticInstrCost(
6766         I->getOpcode(), VectorTy, CostKind,
6767         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6768         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6769         I->getOperand(0), I);
6770   }
6771   case Instruction::Select: {
6772     SelectInst *SI = cast<SelectInst>(I);
6773     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6774     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6775 
6776     const Value *Op0, *Op1;
6777     using namespace llvm::PatternMatch;
6778     if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
6779                         match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
6780       // select x, y, false --> x & y
6781       // select x, true, y --> x | y
6782       const auto [Op1VK, Op1VP] = TTI::getOperandInfo(Op0);
6783       const auto [Op2VK, Op2VP] = TTI::getOperandInfo(Op1);
6784       assert(Op0->getType()->getScalarSizeInBits() == 1 &&
6785               Op1->getType()->getScalarSizeInBits() == 1);
6786 
6787       SmallVector<const Value *, 2> Operands{Op0, Op1};
6788       return TTI.getArithmeticInstrCost(
6789           match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy,
6790           CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, I);
6791     }
6792 
6793     Type *CondTy = SI->getCondition()->getType();
6794     if (!ScalarCond)
6795       CondTy = VectorType::get(CondTy, VF);
6796 
6797     CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
6798     if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
6799       Pred = Cmp->getPredicate();
6800     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred,
6801                                   CostKind, {TTI::OK_AnyValue, TTI::OP_None},
6802                                   {TTI::OK_AnyValue, TTI::OP_None}, I);
6803   }
6804   case Instruction::ICmp:
6805   case Instruction::FCmp: {
6806     Type *ValTy = I->getOperand(0)->getType();
6807 
6808     if (canTruncateToMinimalBitwidth(I, VF)) {
6809       Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6810       (void)Op0AsInstruction;
6811       assert((!canTruncateToMinimalBitwidth(Op0AsInstruction, VF) ||
6812               MinBWs[I] == MinBWs[Op0AsInstruction]) &&
6813              "if both the operand and the compare are marked for "
6814              "truncation, they must have the same bitwidth");
6815       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[I]);
6816     }
6817 
6818     VectorTy = toVectorTy(ValTy, VF);
6819     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
6820                                   cast<CmpInst>(I)->getPredicate(), CostKind,
6821                                   {TTI::OK_AnyValue, TTI::OP_None},
6822                                   {TTI::OK_AnyValue, TTI::OP_None}, I);
6823   }
6824   case Instruction::Store:
6825   case Instruction::Load: {
6826     ElementCount Width = VF;
6827     if (Width.isVector()) {
6828       InstWidening Decision = getWideningDecision(I, Width);
6829       assert(Decision != CM_Unknown &&
6830              "CM decision should be taken at this point");
6831       if (getWideningCost(I, VF) == InstructionCost::getInvalid())
6832         return InstructionCost::getInvalid();
6833       if (Decision == CM_Scalarize)
6834         Width = ElementCount::getFixed(1);
6835     }
6836     VectorTy = toVectorTy(getLoadStoreType(I), Width);
6837     return getMemoryInstructionCost(I, VF);
6838   }
6839   case Instruction::BitCast:
6840     if (I->getType()->isPointerTy())
6841       return 0;
6842     [[fallthrough]];
6843   case Instruction::ZExt:
6844   case Instruction::SExt:
6845   case Instruction::FPToUI:
6846   case Instruction::FPToSI:
6847   case Instruction::FPExt:
6848   case Instruction::PtrToInt:
6849   case Instruction::IntToPtr:
6850   case Instruction::SIToFP:
6851   case Instruction::UIToFP:
6852   case Instruction::Trunc:
6853   case Instruction::FPTrunc: {
6854     // Computes the CastContextHint from a Load/Store instruction.
6855     auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
6856       assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
6857              "Expected a load or a store!");
6858 
6859       if (VF.isScalar() || !TheLoop->contains(I))
6860         return TTI::CastContextHint::Normal;
6861 
6862       switch (getWideningDecision(I, VF)) {
6863       case LoopVectorizationCostModel::CM_GatherScatter:
6864         return TTI::CastContextHint::GatherScatter;
6865       case LoopVectorizationCostModel::CM_Interleave:
6866         return TTI::CastContextHint::Interleave;
6867       case LoopVectorizationCostModel::CM_Scalarize:
6868       case LoopVectorizationCostModel::CM_Widen:
6869         return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
6870                                         : TTI::CastContextHint::Normal;
6871       case LoopVectorizationCostModel::CM_Widen_Reverse:
6872         return TTI::CastContextHint::Reversed;
6873       case LoopVectorizationCostModel::CM_Unknown:
6874         llvm_unreachable("Instr did not go through cost modelling?");
6875       case LoopVectorizationCostModel::CM_VectorCall:
6876       case LoopVectorizationCostModel::CM_IntrinsicCall:
6877         llvm_unreachable_internal("Instr has invalid widening decision");
6878       }
6879 
6880       llvm_unreachable("Unhandled case!");
6881     };
6882 
6883     unsigned Opcode = I->getOpcode();
6884     TTI::CastContextHint CCH = TTI::CastContextHint::None;
6885     // For Trunc, the context is the only user, which must be a StoreInst.
6886     if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
6887       if (I->hasOneUse())
6888         if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
6889           CCH = ComputeCCH(Store);
6890     }
6891     // For Z/Sext, the context is the operand, which must be a LoadInst.
6892     else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
6893              Opcode == Instruction::FPExt) {
6894       if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
6895         CCH = ComputeCCH(Load);
6896     }
6897 
6898     // We optimize the truncation of induction variables having constant
6899     // integer steps. The cost of these truncations is the same as the scalar
6900     // operation.
6901     if (isOptimizableIVTruncate(I, VF)) {
6902       auto *Trunc = cast<TruncInst>(I);
6903       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
6904                                   Trunc->getSrcTy(), CCH, CostKind, Trunc);
6905     }
6906 
6907     // Detect reduction patterns
6908     if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
6909       return *RedCost;
6910 
6911     Type *SrcScalarTy = I->getOperand(0)->getType();
6912     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6913     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6914       SrcScalarTy =
6915           IntegerType::get(SrcScalarTy->getContext(), MinBWs[Op0AsInstruction]);
6916     Type *SrcVecTy =
6917         VectorTy->isVectorTy() ? toVectorTy(SrcScalarTy, VF) : SrcScalarTy;
6918 
6919     if (canTruncateToMinimalBitwidth(I, VF)) {
6920       // If the result type is <= the source type, there will be no extend
6921       // after truncating the users to the minimal required bitwidth.
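           // E.g. a zext from i8 to i32 whose users have been truncated back to
           // i8 (MinBW = 8) becomes a no-op and is therefore free.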
6922       if (VectorTy->getScalarSizeInBits() <= SrcVecTy->getScalarSizeInBits() &&
6923           (I->getOpcode() == Instruction::ZExt ||
6924            I->getOpcode() == Instruction::SExt))
6925         return 0;
6926     }
6927 
6928     return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
6929   }
6930   case Instruction::Call:
6931     return getVectorCallCost(cast<CallInst>(I), VF);
6932   case Instruction::ExtractValue:
6933     return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
6934   case Instruction::Alloca:
6935     // We cannot easily widen alloca to a scalable alloca, as
6936     // the result would need to be a vector of pointers.
6937     if (VF.isScalable())
6938       return InstructionCost::getInvalid();
6939     [[fallthrough]];
6940   default:
6941     // This opcode is unknown. Assume that it is the same as 'mul'.
6942     return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6943   } // end of switch.
6944 }
6945 
6946 void LoopVectorizationCostModel::collectValuesToIgnore() {
6947   // Ignore ephemeral values.
6948   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
6949 
6950   SmallVector<Value *, 4> DeadInterleavePointerOps;
6951   SmallVector<Value *, 4> DeadOps;
6952 
6953   // If a scalar epilogue is required, users outside the loop won't use
6954   // live-outs from the vector loop but from the scalar epilogue. Ignore them if
6955   // that is the case.
6956   bool RequiresScalarEpilogue = requiresScalarEpilogue(true);
6957   auto IsLiveOutDead = [this, RequiresScalarEpilogue](User *U) {
6958     return RequiresScalarEpilogue &&
6959            !TheLoop->contains(cast<Instruction>(U)->getParent());
6960   };
6961 
6962   LoopBlocksDFS DFS(TheLoop);
6963   DFS.perform(LI);
6964   MapVector<Value *, SmallVector<Value *>> DeadInvariantStoreOps;
6965   for (BasicBlock *BB : reverse(make_range(DFS.beginRPO(), DFS.endRPO())))
6966     for (Instruction &I : reverse(*BB)) {
6967       // Find all stores to invariant variables. Since they are going to sink
6968       // outside the loop, we do not need to calculate a cost for them.
6969       StoreInst *SI;
6970       if ((SI = dyn_cast<StoreInst>(&I)) &&
6971           Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) {
6972         ValuesToIgnore.insert(&I);
6973         DeadInvariantStoreOps[SI->getPointerOperand()].push_back(
6974             SI->getValueOperand());
6975       }
6976 
6977       if (VecValuesToIgnore.contains(&I) || ValuesToIgnore.contains(&I))
6978         continue;
6979 
6980       // Add instructions that would be trivially dead and are only used by
6981       // values already ignored to DeadOps to seed worklist.
6982       if (wouldInstructionBeTriviallyDead(&I, TLI) &&
6983           all_of(I.users(), [this, IsLiveOutDead](User *U) {
6984             return VecValuesToIgnore.contains(U) ||
6985                    ValuesToIgnore.contains(U) || IsLiveOutDead(U);
6986           }))
6987         DeadOps.push_back(&I);
6988 
6989       // For interleave groups, we only create a pointer for the start of the
6990       // interleave group. Queue up addresses of group members except the insert
6991       // position for further processing.
6992       if (isAccessInterleaved(&I)) {
6993         auto *Group = getInterleavedAccessGroup(&I);
6994         if (Group->getInsertPos() == &I)
6995           continue;
6996         Value *PointerOp = getLoadStorePointerOperand(&I);
6997         DeadInterleavePointerOps.push_back(PointerOp);
6998       }
6999 
7000       // Queue branches for analysis. They are dead if their successors only
7001       // contain dead instructions.
7002       if (auto *Br = dyn_cast<BranchInst>(&I)) {
7003         if (Br->isConditional())
7004           DeadOps.push_back(&I);
7005       }
7006     }
7007 
7008   // Mark ops feeding interleave group members as free, if they are only used
7009   // by other dead computations.
7010   for (unsigned I = 0; I != DeadInterleavePointerOps.size(); ++I) {
7011     auto *Op = dyn_cast<Instruction>(DeadInterleavePointerOps[I]);
7012     if (!Op || !TheLoop->contains(Op) || any_of(Op->users(), [this](User *U) {
7013           Instruction *UI = cast<Instruction>(U);
7014           return !VecValuesToIgnore.contains(U) &&
7015                  (!isAccessInterleaved(UI) ||
7016                   getInterleavedAccessGroup(UI)->getInsertPos() == UI);
7017         }))
7018       continue;
7019     VecValuesToIgnore.insert(Op);
7020     DeadInterleavePointerOps.append(Op->op_begin(), Op->op_end());
7021   }
7022 
7023   for (const auto &[_, Ops] : DeadInvariantStoreOps) {
7024     for (Value *Op : ArrayRef(Ops).drop_back())
7025       DeadOps.push_back(Op);
7026   }
7027   // Mark ops that would be trivially dead and are only used by ignored
7028   // instructions as free.
7029   BasicBlock *Header = TheLoop->getHeader();
7030 
7031   // Returns true if the block contains only dead instructions. Such blocks will
7032   // be removed by VPlan-to-VPlan transforms and won't be considered by the
7033   // VPlan-based cost model, so skip them in the legacy cost-model as well.
7034   auto IsEmptyBlock = [this](BasicBlock *BB) {
7035     return all_of(*BB, [this](Instruction &I) {
7036       return ValuesToIgnore.contains(&I) || VecValuesToIgnore.contains(&I) ||
7037              (isa<BranchInst>(&I) && !cast<BranchInst>(&I)->isConditional());
7038     });
7039   };
7040   for (unsigned I = 0; I != DeadOps.size(); ++I) {
7041     auto *Op = dyn_cast<Instruction>(DeadOps[I]);
7042 
7043     // Check if the branch should be considered dead.
7044     if (auto *Br = dyn_cast_or_null<BranchInst>(Op)) {
7045       BasicBlock *ThenBB = Br->getSuccessor(0);
7046       BasicBlock *ElseBB = Br->getSuccessor(1);
7047       // Don't consider branches leaving the loop for simplification.
7048       if (!TheLoop->contains(ThenBB) || !TheLoop->contains(ElseBB))
7049         continue;
7050       bool ThenEmpty = IsEmptyBlock(ThenBB);
7051       bool ElseEmpty = IsEmptyBlock(ElseBB);
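           // The branch is dead if both successors are empty, or if one empty
           // successor falls straight through to the other successor, which in
           // turn must have no phis.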
7052       if ((ThenEmpty && ElseEmpty) ||
7053           (ThenEmpty && ThenBB->getSingleSuccessor() == ElseBB &&
7054            ElseBB->phis().empty()) ||
7055           (ElseEmpty && ElseBB->getSingleSuccessor() == ThenBB &&
7056            ThenBB->phis().empty())) {
7057         VecValuesToIgnore.insert(Br);
7058         DeadOps.push_back(Br->getCondition());
7059       }
7060       continue;
7061     }
7062 
7063     // Skip any op that shouldn't be considered dead.
7064     if (!Op || !TheLoop->contains(Op) ||
7065         (isa<PHINode>(Op) && Op->getParent() == Header) ||
7066         !wouldInstructionBeTriviallyDead(Op, TLI) ||
7067         any_of(Op->users(), [this, IsLiveOutDead](User *U) {
7068           return !VecValuesToIgnore.contains(U) &&
7069                  !ValuesToIgnore.contains(U) && !IsLiveOutDead(U);
7070         }))
7071       continue;
7072 
7073     if (!TheLoop->contains(Op->getParent()))
7074       continue;
7075 
7076     // If all of Op's users are in ValuesToIgnore, add it to ValuesToIgnore,
7077     // which applies to both the scalar and vector versions. Otherwise it is
7078     // only dead in vector versions, so only add it to VecValuesToIgnore.
7079     if (all_of(Op->users(),
7080                [this](User *U) { return ValuesToIgnore.contains(U); }))
7081       ValuesToIgnore.insert(Op);
7082 
7083     VecValuesToIgnore.insert(Op);
7084     DeadOps.append(Op->op_begin(), Op->op_end());
7085   }
7086 
7087   // Ignore type-promoting instructions we identified during reduction
7088   // detection.
7089   for (const auto &Reduction : Legal->getReductionVars()) {
7090     const RecurrenceDescriptor &RedDes = Reduction.second;
7091     const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
7092     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7093   }
7094   // Ignore type-casting instructions we identified during induction
7095   // detection.
7096   for (const auto &Induction : Legal->getInductionVars()) {
7097     const InductionDescriptor &IndDes = Induction.second;
7098     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7099     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7100   }
7101 }
7102 
7103 void LoopVectorizationCostModel::collectInLoopReductions() {
7104   for (const auto &Reduction : Legal->getReductionVars()) {
7105     PHINode *Phi = Reduction.first;
7106     const RecurrenceDescriptor &RdxDesc = Reduction.second;
7107 
7108     // We don't collect reductions that are type promoted (yet).
7109     if (RdxDesc.getRecurrenceType() != Phi->getType())
7110       continue;
7111 
7112     // If the target would prefer this reduction to happen "in-loop", then we
7113     // want to record it as such.
7114     unsigned Opcode = RdxDesc.getOpcode();
7115     if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
7116         !TTI.preferInLoopReduction(Opcode, Phi->getType(),
7117                                    TargetTransformInfo::ReductionFlags()))
7118       continue;
7119 
7120     // Check that we can correctly put the reductions into the loop, by
7121     // finding the chain of operations that leads from the phi to the loop
7122     // exit value.
7123     SmallVector<Instruction *, 4> ReductionOperations =
7124         RdxDesc.getReductionOpChain(Phi, TheLoop);
7125     bool InLoop = !ReductionOperations.empty();
7126 
7127     if (InLoop) {
7128       InLoopReductions.insert(Phi);
7129       // Add the elements to InLoopReductionImmediateChains for cost modelling.
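           // Each entry maps a reduction operation to the previous link in the
           // chain (the phi for the first operation); the reduction cost
           // modelling later walks this map back to the reduction phi.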
7130       Instruction *LastChain = Phi;
7131       for (auto *I : ReductionOperations) {
7132         InLoopReductionImmediateChains[I] = LastChain;
7133         LastChain = I;
7134       }
7135     }
7136     LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
7137                       << " reduction for phi: " << *Phi << "\n");
7138   }
7139 }
7140 
7141 // This function selects a scalable VF if the target supports scalable
7142 // vectors, and a fixed one otherwise.
7143 // TODO: we could return a pair of values that specify the max VF and
7144 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
7145 // `buildVPlans(VF, VF)`. We cannot do that yet because VPlan does not
7146 // have a cost model that can choose which plan to execute if more than
7147 // one is generated.
7148 static ElementCount determineVPlanVF(const TargetTransformInfo &TTI,
7149                                      LoopVectorizationCostModel &CM) {
7150   unsigned WidestType;
7151   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
7152 
7153   TargetTransformInfo::RegisterKind RegKind =
7154       TTI.enableScalableVectorization()
7155           ? TargetTransformInfo::RGK_ScalableVector
7156           : TargetTransformInfo::RGK_FixedWidthVector;
7157 
7158   TypeSize RegSize = TTI.getRegisterBitWidth(RegKind);
7159   unsigned N = RegSize.getKnownMinValue() / WidestType;
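       // E.g. a 128-bit (minimum) register width and a widest element type of
       // 32 bits give N = 4, i.e. VF = 4, or vscale x 4 if scalable vectors are
       // enabled.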
7160   return ElementCount::get(N, RegSize.isScalable());
7161 }
7162 
7163 VectorizationFactor
7164 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
7165   ElementCount VF = UserVF;
7166   // Outer loop handling: outer loops may require CFG and instruction level
7167   // transformations before even evaluating whether vectorization is profitable.
7168   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7169   // the vectorization pipeline.
7170   if (!OrigLoop->isInnermost()) {
7171     // If the user doesn't provide a vectorization factor, determine a
7172     // reasonable one.
7173     if (UserVF.isZero()) {
7174       VF = determineVPlanVF(TTI, CM);
7175       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
7176 
7177       // Make sure we have a VF > 1 for stress testing.
7178       if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
7179         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
7180                           << "overriding computed VF.\n");
7181         VF = ElementCount::getFixed(4);
7182       }
7183     } else if (UserVF.isScalable() && !TTI.supportsScalableVectors() &&
7184                !ForceTargetSupportsScalableVectors) {
7185       LLVM_DEBUG(dbgs() << "LV: Not vectorizing. Scalable VF requested, but "
7186                         << "not supported by the target.\n");
7187       reportVectorizationFailure(
7188           "Scalable vectorization requested but not supported by the target",
7189           "the scalable user-specified vectorization width for outer-loop "
7190           "vectorization cannot be used because the target does not support "
7191           "scalable vectors.",
7192           "ScalableVFUnfeasible", ORE, OrigLoop);
7193       return VectorizationFactor::Disabled();
7194     }
7195     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7196     assert(isPowerOf2_32(VF.getKnownMinValue()) &&
7197            "VF needs to be a power of two");
7198     LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
7199                       << "VF " << VF << " to build VPlans.\n");
7200     buildVPlans(VF, VF);
7201 
7202     // For VPlan build stress testing, we bail out after VPlan construction.
7203     if (VPlanBuildStressTest)
7204       return VectorizationFactor::Disabled();
7205 
7206     return {VF, 0 /*Cost*/, 0 /* ScalarCost */};
7207   }
7208 
7209   LLVM_DEBUG(
7210       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
7211                 "VPlan-native path.\n");
7212   return VectorizationFactor::Disabled();
7213 }
7214 
7215 void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
7216   assert(OrigLoop->isInnermost() && "Inner loop expected.");
7217   CM.collectValuesToIgnore();
7218   CM.collectElementTypesForWidening();
7219 
7220   FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
7221   if (!MaxFactors) // Cases that should not be vectorized nor interleaved.
7222     return;
7223 
7224   // Invalidate interleave groups if all blocks of loop will be predicated.
7225   if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
7226       !useMaskedInterleavedAccesses(TTI)) {
7227     LLVM_DEBUG(
7228         dbgs()
7229         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
7230            "which requires masked-interleaved support.\n");
7231     if (CM.InterleaveInfo.invalidateGroups())
7232       // Invalidating interleave groups also requires invalidating all decisions
7233       // based on them, which includes widening decisions and uniform and scalar
7234       // values.
7235       CM.invalidateCostModelingDecisions();
7236   }
7237 
7238   if (CM.foldTailByMasking())
7239     Legal->prepareToFoldTailByMasking();
7240 
7241   ElementCount MaxUserVF =
7242       UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
7243   if (UserVF) {
7244     if (!ElementCount::isKnownLE(UserVF, MaxUserVF)) {
7245       reportVectorizationInfo(
7246           "UserVF ignored because it may be larger than the maximal safe VF",
7247           "InvalidUserVF", ORE, OrigLoop);
7248     } else {
7249       assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
7250              "VF needs to be a power of two");
7251       // Collect the instructions (and their associated costs) that will be more
7252       // profitable to scalarize.
7253       CM.collectInLoopReductions();
7254       if (CM.selectUserVectorizationFactor(UserVF)) {
7255         LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
7256         buildVPlansWithVPRecipes(UserVF, UserVF);
7257         LLVM_DEBUG(printPlans(dbgs()));
7258         return;
7259       }
7260       reportVectorizationInfo("UserVF ignored because of invalid costs.",
7261                               "InvalidCost", ORE, OrigLoop);
7262     }
7263   }
7264 
7265   // Collect the Vectorization Factor Candidates.
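  // For example (illustrative): with MaxFactors.FixedVF = 8 and
  // MaxFactors.ScalableVF = vscale x 4, the candidate list below is
  // {1, 2, 4, 8, vscale x 1, vscale x 2, vscale x 4}.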
7266   SmallVector<ElementCount> VFCandidates;
7267   for (auto VF = ElementCount::getFixed(1);
7268        ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
7269     VFCandidates.push_back(VF);
7270   for (auto VF = ElementCount::getScalable(1);
7271        ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
7272     VFCandidates.push_back(VF);
7273 
7274   CM.collectInLoopReductions();
7275   for (const auto &VF : VFCandidates) {
7276     // Collect Uniform and Scalar instructions after vectorization with VF.
7277     CM.collectUniformsAndScalars(VF);
7278 
7279     // Collect the instructions (and their associated costs) that will be more
7280     // profitable to scalarize.
7281     if (VF.isVector())
7282       CM.collectInstsToScalarize(VF);
7283   }
7284 
7285   buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
7286   buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
7287 
7288   LLVM_DEBUG(printPlans(dbgs()));
7289 }
7290 
7291 InstructionCost VPCostContext::getLegacyCost(Instruction *UI,
7292                                              ElementCount VF) const {
7293   if (ForceTargetInstructionCost.getNumOccurrences())
7294     return InstructionCost(ForceTargetInstructionCost);
7295   return CM.getInstructionCost(UI, VF);
7296 }
7297 
7298 bool VPCostContext::skipCostComputation(Instruction *UI, bool IsVector) const {
7299   return CM.ValuesToIgnore.contains(UI) ||
7300          (IsVector && CM.VecValuesToIgnore.contains(UI)) ||
7301          SkipCostComputation.contains(UI);
7302 }
7303 
7304 InstructionCost
7305 LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
7306                                           VPCostContext &CostCtx) const {
7307   InstructionCost Cost;
7308   // Cost modeling for inductions is inaccurate in the legacy cost model
7309   // compared to the recipes that are generated. To match it here initially,
7310   // during VPlan cost-model bring-up, directly use the induction costs from
7311   // the legacy cost model. Note that we do this as pre-processing; the VPlan
7312   // may not have any recipes for the original induction increment instruction
7313   // and may replace truncates with VPWidenIntOrFpInductionRecipe. We precompute
7314   // the cost of induction phis and increments (both that are represented by
7315   // recipes and those that are not), to avoid distinguishing between them here,
7316   // and skip all recipes that represent induction phis and increments (the
7317   // former case) later on, if they exist, to avoid counting them twice.
7318   // Similarly we pre-compute the cost of any optimized truncates.
7319   // TODO: Switch to more accurate costing based on VPlan.
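  // For example (illustrative): for an induction phi %iv with increment
  // %iv.next = add i64 %iv, 1 and a user %t = trunc i64 %iv to i32 that is an
  // optimizable IV truncate, the phi, the increment and the truncate are all
  // pre-costed here and added to SkipCostComputation so they are skipped when
  // costing the corresponding recipes.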
7320   for (const auto &[IV, IndDesc] : Legal->getInductionVars()) {
7321     Instruction *IVInc = cast<Instruction>(
7322         IV->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
7323     SmallVector<Instruction *> IVInsts = {IVInc};
7324     for (unsigned I = 0; I != IVInsts.size(); I++) {
7325       for (Value *Op : IVInsts[I]->operands()) {
7326         auto *OpI = dyn_cast<Instruction>(Op);
7327         if (Op == IV || !OpI || !OrigLoop->contains(OpI) || !Op->hasOneUse())
7328           continue;
7329         IVInsts.push_back(OpI);
7330       }
7331     }
7332     IVInsts.push_back(IV);
7333     for (User *U : IV->users()) {
7334       auto *CI = cast<Instruction>(U);
7335       if (!CostCtx.CM.isOptimizableIVTruncate(CI, VF))
7336         continue;
7337       IVInsts.push_back(CI);
7338     }
7339 
7340     // If the vector loop gets executed exactly once with the given VF, ignore
7341     // the costs of comparison and induction instructions, as they'll get
7342     // simplified away.
7343     // TODO: Remove this code after stepping away from the legacy cost model and
7344     // adding code to simplify VPlans before calculating their costs.
7345     auto TC = PSE.getSE()->getSmallConstantTripCount(OrigLoop);
7346     if (VF.isFixed() && TC == VF.getFixedValue() && !CM.foldTailByMasking())
7347       addFullyUnrolledInstructionsToIgnore(OrigLoop, Legal->getInductionVars(),
7348                                            CostCtx.SkipCostComputation);
7349 
7350     for (Instruction *IVInst : IVInsts) {
7351       if (CostCtx.skipCostComputation(IVInst, VF.isVector()))
7352         continue;
7353       InstructionCost InductionCost = CostCtx.getLegacyCost(IVInst, VF);
7354       LLVM_DEBUG({
7355         dbgs() << "Cost of " << InductionCost << " for VF " << VF
7356                << ": induction instruction " << *IVInst << "\n";
7357       });
7358       Cost += InductionCost;
7359       CostCtx.SkipCostComputation.insert(IVInst);
7360     }
7361   }
7362 
7363   // Compute the cost of all exiting conditions of the loop using the legacy
7364   // cost model. This is to match the legacy behavior, which adds the cost of
7365   // all exit conditions. Note that this over-estimates the cost, as there will
7366   // be a single condition to control the vector loop.
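  // For example (illustrative): a loop with two exiting blocks contributes the
  // compares of both exit conditions here, even though the vectorized loop
  // will be controlled by a single latch compare.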
7367   SmallVector<BasicBlock *> Exiting;
7368   CM.TheLoop->getExitingBlocks(Exiting);
7369   SetVector<Instruction *> ExitInstrs;
7370   // Collect all exit conditions.
7371   for (BasicBlock *EB : Exiting) {
7372     auto *Term = dyn_cast<BranchInst>(EB->getTerminator());
7373     if (!Term)
7374       continue;
7375     if (auto *CondI = dyn_cast<Instruction>(Term->getOperand(0))) {
7376       ExitInstrs.insert(CondI);
7377     }
7378   }
7379   // Compute the cost of all instructions only feeding the exit conditions.
7380   for (unsigned I = 0; I != ExitInstrs.size(); ++I) {
7381     Instruction *CondI = ExitInstrs[I];
7382     if (!OrigLoop->contains(CondI) ||
7383         !CostCtx.SkipCostComputation.insert(CondI).second)
7384       continue;
7385     InstructionCost CondICost = CostCtx.getLegacyCost(CondI, VF);
7386     LLVM_DEBUG({
7387       dbgs() << "Cost of " << CondICost << " for VF " << VF
7388              << ": exit condition instruction " << *CondI << "\n";
7389     });
7390     Cost += CondICost;
7391     for (Value *Op : CondI->operands()) {
7392       auto *OpI = dyn_cast<Instruction>(Op);
7393       if (!OpI || any_of(OpI->users(), [&ExitInstrs, this](User *U) {
7394             return OrigLoop->contains(cast<Instruction>(U)->getParent()) &&
7395                    !ExitInstrs.contains(cast<Instruction>(U));
7396           }))
7397         continue;
7398       ExitInstrs.insert(OpI);
7399     }
7400   }
7401 
7402   // The legacy cost model has special logic to compute the cost of in-loop
7403   // reductions, which may be smaller than the sum of all instructions involved
7404   // in the reduction.
7405   // TODO: Switch to costing based on VPlan once the logic has been ported.
7406   for (const auto &[RedPhi, RdxDesc] : Legal->getReductionVars()) {
7407     if (ForceTargetInstructionCost.getNumOccurrences())
7408       continue;
7409 
7410     if (!CM.isInLoopReduction(RedPhi))
7411       continue;
7412 
7413     const auto &ChainOps = RdxDesc.getReductionOpChain(RedPhi, OrigLoop);
7414     SetVector<Instruction *> ChainOpsAndOperands(ChainOps.begin(),
7415                                                  ChainOps.end());
7416     auto IsZExtOrSExt = [](const unsigned Opcode) -> bool {
7417       return Opcode == Instruction::ZExt || Opcode == Instruction::SExt;
7418     };
7419     // Also include the operands of instructions in the chain, as the cost-model
7420     // may mark extends as free.
7421     //
7422     // For ARM, some of the instructions can be folded into the reduction
7423     // instruction. So we need to mark all folded instructions free.
7424     // For example: We can fold reduce(mul(ext(A), ext(B))) into one
7425     // instruction.
7426     for (auto *ChainOp : ChainOps) {
7427       for (Value *Op : ChainOp->operands()) {
7428         if (auto *I = dyn_cast<Instruction>(Op)) {
7429           ChainOpsAndOperands.insert(I);
7430           if (I->getOpcode() == Instruction::Mul) {
7431             auto *Ext0 = dyn_cast<Instruction>(I->getOperand(0));
7432             auto *Ext1 = dyn_cast<Instruction>(I->getOperand(1));
7433             if (Ext0 && IsZExtOrSExt(Ext0->getOpcode()) && Ext1 &&
7434                 Ext0->getOpcode() == Ext1->getOpcode()) {
7435               ChainOpsAndOperands.insert(Ext0);
7436               ChainOpsAndOperands.insert(Ext1);
7437             }
7438           }
7439         }
7440       }
7441     }
7442 
7443     // Pre-compute the cost for I, if it has a reduction pattern cost.
7444     for (Instruction *I : ChainOpsAndOperands) {
7445       auto ReductionCost = CM.getReductionPatternCost(
7446           I, VF, toVectorTy(I->getType(), VF), TTI::TCK_RecipThroughput);
7447       if (!ReductionCost)
7448         continue;
7449 
7450       assert(!CostCtx.SkipCostComputation.contains(I) &&
7451              "reduction op visited multiple times");
7452       CostCtx.SkipCostComputation.insert(I);
7453       LLVM_DEBUG(dbgs() << "Cost of " << ReductionCost << " for VF " << VF
7454                         << ":\n in-loop reduction " << *I << "\n");
7455       Cost += *ReductionCost;
7456     }
7457   }
7458 
7459   // Pre-compute the costs for branches except for the backedge, as the number
7460   // of replicate regions in a VPlan may not directly match the number of
7461   // branches, which would lead to different decisions.
7462   // TODO: Compute cost of branches for each replicate region in the VPlan,
7463   // which is more accurate than the legacy cost model.
7464   for (BasicBlock *BB : OrigLoop->blocks()) {
7465     if (CostCtx.skipCostComputation(BB->getTerminator(), VF.isVector()))
7466       continue;
7467     CostCtx.SkipCostComputation.insert(BB->getTerminator());
7468     if (BB == OrigLoop->getLoopLatch())
7469       continue;
7470     auto BranchCost = CostCtx.getLegacyCost(BB->getTerminator(), VF);
7471     Cost += BranchCost;
7472   }
7473 
7474   // Pre-compute costs for instructions that are forced-scalar or profitable to
7475   // scalarize. Their costs will be computed separately in the legacy cost
7476   // model.
7477   for (Instruction *ForcedScalar : CM.ForcedScalars[VF]) {
7478     if (CostCtx.skipCostComputation(ForcedScalar, VF.isVector()))
7479       continue;
7480     CostCtx.SkipCostComputation.insert(ForcedScalar);
7481     InstructionCost ForcedCost = CostCtx.getLegacyCost(ForcedScalar, VF);
7482     LLVM_DEBUG({
7483       dbgs() << "Cost of " << ForcedCost << " for VF " << VF
7484              << ": forced scalar " << *ForcedScalar << "\n";
7485     });
7486     Cost += ForcedCost;
7487   }
7488   for (const auto &[Scalarized, ScalarCost] : CM.InstsToScalarize[VF]) {
7489     if (CostCtx.skipCostComputation(Scalarized, VF.isVector()))
7490       continue;
7491     CostCtx.SkipCostComputation.insert(Scalarized);
7492     LLVM_DEBUG({
7493       dbgs() << "Cost of " << ScalarCost << " for VF " << VF
7494              << ": profitable to scalarize " << *Scalarized << "\n";
7495     });
7496     Cost += ScalarCost;
7497   }
7498 
7499   return Cost;
7500 }
7501 
7502 InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
7503                                                ElementCount VF) const {
7504   VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM);
7505   InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx);
7506 
7507   // Now compute and add the VPlan-based cost.
7508   Cost += Plan.cost(VF, CostCtx);
7509 #ifndef NDEBUG
7510   unsigned EstimatedWidth = getEstimatedRuntimeVF(OrigLoop, CM.TTI, VF);
7511   LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost
7512                     << " (Estimated cost per lane: ");
7513   if (Cost.isValid()) {
7514     double CostPerLane = double(*Cost.getValue()) / EstimatedWidth;
7515     LLVM_DEBUG(dbgs() << format("%.1f", CostPerLane));
7516   } else /* No point dividing an invalid cost - it will still be invalid */
7517     LLVM_DEBUG(dbgs() << "Invalid");
7518   LLVM_DEBUG(dbgs() << ")\n");
7519 #endif
7520   return Cost;
7521 }
7522 
7523 #ifndef NDEBUG
7524 /// Return true if the original loop \p TheLoop contains any instructions that
7525 /// do not have corresponding recipes in \p Plan and are not marked to be
7526 /// ignored in \p CostCtx. This means the VPlan contains simplifications that
7527 /// the legacy cost-model did not account for.
7528 static bool planContainsAdditionalSimplifications(VPlan &Plan,
7529                                                   VPCostContext &CostCtx,
7530                                                   Loop *TheLoop) {
7531   // First collect all instructions for the recipes in Plan.
7532   auto GetInstructionForCost = [](const VPRecipeBase *R) -> Instruction * {
7533     if (auto *S = dyn_cast<VPSingleDefRecipe>(R))
7534       return dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
7535     if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(R))
7536       return &WidenMem->getIngredient();
7537     return nullptr;
7538   };
7539 
7540   DenseSet<Instruction *> SeenInstrs;
7541   auto Iter = vp_depth_first_deep(Plan.getVectorLoopRegion()->getEntry());
7542   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
7543     for (VPRecipeBase &R : *VPBB) {
7544       if (auto *IR = dyn_cast<VPInterleaveRecipe>(&R)) {
7545         auto *IG = IR->getInterleaveGroup();
7546         unsigned NumMembers = IG->getNumMembers();
7547         for (unsigned I = 0; I != NumMembers; ++I) {
7548           if (Instruction *M = IG->getMember(I))
7549             SeenInstrs.insert(M);
7550         }
7551         continue;
7552       }
7553       // The VPlan-based cost model is more accurate for partial reduction and
7554       // comparing against the legacy cost isn't desirable.
7555       if (isa<VPPartialReductionRecipe>(&R))
7556         return true;
7557       if (Instruction *UI = GetInstructionForCost(&R))
7558         SeenInstrs.insert(UI);
7559     }
7560   }
7561 
7562   // Return true if the loop contains any instructions that are not also part of
7563   // the VPlan or are skipped for VPlan-based cost computations. This indicates
7564   // that the VPlan contains extra simplifications.
7565   return any_of(TheLoop->blocks(), [&SeenInstrs, &CostCtx,
7566                                     TheLoop](BasicBlock *BB) {
7567     return any_of(*BB, [&SeenInstrs, &CostCtx, TheLoop, BB](Instruction &I) {
7568       if (isa<PHINode>(&I) && BB == TheLoop->getHeader())
7569         return false;
7570       return !SeenInstrs.contains(&I) && !CostCtx.skipCostComputation(&I, true);
7571     });
7572   });
7573 }
7574 #endif
7575 
7576 VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
7577   if (VPlans.empty())
7578     return VectorizationFactor::Disabled();
7579   // If there is a single VPlan with a single VF, return it directly.
7580   VPlan &FirstPlan = *VPlans[0];
7581   if (VPlans.size() == 1 && size(FirstPlan.vectorFactors()) == 1)
7582     return {*FirstPlan.vectorFactors().begin(), 0, 0};
7583 
7584   ElementCount ScalarVF = ElementCount::getFixed(1);
7585   assert(hasPlanWithVF(ScalarVF) &&
7586          "More than a single plan/VF w/o any plan having scalar VF");
7587 
7588   // TODO: Compute scalar cost using VPlan-based cost model.
7589   InstructionCost ScalarCost = CM.expectedCost(ScalarVF);
7590   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ScalarCost << ".\n");
7591   VectorizationFactor ScalarFactor(ScalarVF, ScalarCost, ScalarCost);
7592   VectorizationFactor BestFactor = ScalarFactor;
7593 
7594   bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
7595   if (ForceVectorization) {
7596     // Ignore scalar width, because the user explicitly wants vectorization.
7597     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
7598     // evaluation.
7599     BestFactor.Cost = InstructionCost::getMax();
7600   }
7601 
7602   for (auto &P : VPlans) {
7603     for (ElementCount VF : P->vectorFactors()) {
7604       if (VF.isScalar())
7605         continue;
7606       if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
7607         LLVM_DEBUG(
7608             dbgs()
7609             << "LV: Not considering vector loop of width " << VF
7610             << " because it will not generate any vector instructions.\n");
7611         continue;
7612       }
7613 
7614       InstructionCost Cost = cost(*P, VF);
7615       VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);
7616       if (isMoreProfitable(CurrentFactor, BestFactor))
7617         BestFactor = CurrentFactor;
7618 
7619       // If profitable, add it to the ProfitableVFs list.
7620       if (isMoreProfitable(CurrentFactor, ScalarFactor))
7621         ProfitableVFs.push_back(CurrentFactor);
7622     }
7623   }
7624 
7625 #ifndef NDEBUG
7626   // Select the optimal vectorization factor according to the legacy cost-model.
7627   // This is now only used to verify the decisions by the new VPlan-based
7628   // cost-model and will be retired once the VPlan-based cost-model is
7629   // stabilized.
7630   VectorizationFactor LegacyVF = selectVectorizationFactor();
7631   VPlan &BestPlan = getPlanFor(BestFactor.Width);
7632 
7633   // Pre-compute the cost and use it to check if BestPlan contains any
7634   // simplifications not accounted for in the legacy cost model. If that's the
7635   // case, don't trigger the assertion, as the extra simplifications may cause a
7636   // different VF to be picked by the VPlan-based cost model.
7637   VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM);
7638   precomputeCosts(BestPlan, BestFactor.Width, CostCtx);
7639   assert((BestFactor.Width == LegacyVF.Width ||
7640           planContainsAdditionalSimplifications(getPlanFor(BestFactor.Width),
7641                                                 CostCtx, OrigLoop) ||
7642           planContainsAdditionalSimplifications(getPlanFor(LegacyVF.Width),
7643                                                 CostCtx, OrigLoop)) &&
7644          " VPlan cost model and legacy cost model disagreed");
7645   assert((BestFactor.Width.isScalar() || BestFactor.ScalarCost > 0) &&
7646          "when vectorizing, the scalar cost must be computed.");
7647 #endif
7648 
7649   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << BestFactor.Width << ".\n");
7650   return BestFactor;
7651 }
7652 
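// Illustrative example of the result (assuming no unroll-disable metadata was
// already present): the loop ID gains an operand such as
//   !0 = distinct !{!0, ..., !1}
//   !1 = !{!"llvm.loop.unroll.runtime.disable"}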
7653 static void addRuntimeUnrollDisableMetaData(Loop *L) {
7654   SmallVector<Metadata *, 4> MDs;
7655   // Reserve first location for self reference to the LoopID metadata node.
7656   MDs.push_back(nullptr);
7657   bool IsUnrollMetadata = false;
7658   MDNode *LoopID = L->getLoopID();
7659   if (LoopID) {
7660     // First find existing loop unrolling disable metadata.
7661     for (unsigned I = 1, IE = LoopID->getNumOperands(); I < IE; ++I) {
7662       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(I));
7663       if (MD) {
7664         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
7665         IsUnrollMetadata =
7666             S && S->getString().starts_with("llvm.loop.unroll.disable");
7667       }
7668       MDs.push_back(LoopID->getOperand(I));
7669     }
7670   }
7671 
7672   if (!IsUnrollMetadata) {
7673     // Add runtime unroll disable metadata.
7674     LLVMContext &Context = L->getHeader()->getContext();
7675     SmallVector<Metadata *, 1> DisableOperands;
7676     DisableOperands.push_back(
7677         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7678     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7679     MDs.push_back(DisableNode);
7680     MDNode *NewLoopID = MDNode::get(Context, MDs);
7681     // Set operand 0 to refer to the loop id itself.
7682     NewLoopID->replaceOperandWith(0, NewLoopID);
7683     L->setLoopID(NewLoopID);
7684   }
7685 }
7686 
7687 // If \p R is a ComputeReductionResult when vectorizing the epilog loop,
7688 // fix the reduction's scalar PHI node by adding the incoming value from the
7689 // main vector loop.
7690 static void fixReductionScalarResumeWhenVectorizingEpilog(
7691     VPRecipeBase *R, VPTransformState &State, BasicBlock *LoopMiddleBlock,
7692     BasicBlock *BypassBlock) {
7693   auto *EpiRedResult = dyn_cast<VPInstruction>(R);
7694   if (!EpiRedResult ||
7695       EpiRedResult->getOpcode() != VPInstruction::ComputeReductionResult)
7696     return;
7697 
7698   auto *EpiRedHeaderPhi =
7699       cast<VPReductionPHIRecipe>(EpiRedResult->getOperand(0));
7700   const RecurrenceDescriptor &RdxDesc =
7701       EpiRedHeaderPhi->getRecurrenceDescriptor();
7702   Value *MainResumeValue =
7703       EpiRedHeaderPhi->getStartValue()->getUnderlyingValue();
7704   if (RecurrenceDescriptor::isAnyOfRecurrenceKind(
7705           RdxDesc.getRecurrenceKind())) {
7706     auto *Cmp = cast<ICmpInst>(MainResumeValue);
7707     assert(Cmp->getPredicate() == CmpInst::ICMP_NE &&
7708            "AnyOf expected to start with ICMP_NE");
7709     assert(Cmp->getOperand(1) == RdxDesc.getRecurrenceStartValue() &&
7710            "AnyOf expected to start by comparing main resume value to original "
7711            "start value");
7712     MainResumeValue = Cmp->getOperand(0);
7713   } else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(
7714                  RdxDesc.getRecurrenceKind())) {
7715     using namespace llvm::PatternMatch;
7716     Value *Cmp, *OrigResumeV;
7717     bool IsExpectedPattern =
7718         match(MainResumeValue, m_Select(m_OneUse(m_Value(Cmp)),
7719                                         m_Specific(RdxDesc.getSentinelValue()),
7720                                         m_Value(OrigResumeV))) &&
7721         match(Cmp,
7722               m_SpecificICmp(ICmpInst::ICMP_EQ, m_Specific(OrigResumeV),
7723                              m_Specific(RdxDesc.getRecurrenceStartValue())));
7724     assert(IsExpectedPattern && "Unexpected reduction resume pattern");
7725     (void)IsExpectedPattern;
7726     MainResumeValue = OrigResumeV;
7727   }
7728   PHINode *MainResumePhi = cast<PHINode>(MainResumeValue);
7729 
7730   // When fixing reductions in the epilogue loop we should already have
7731   // created a bc.merge.rdx Phi after the main vector body. Ensure that we carry
7732   // over the incoming values correctly.
7733   using namespace VPlanPatternMatch;
7734   auto IsResumePhi = [](VPUser *U) {
7735     return match(
7736         U, m_VPInstruction<VPInstruction::ResumePhi>(m_VPValue(), m_VPValue()));
7737   };
7738   assert(count_if(EpiRedResult->users(), IsResumePhi) == 1 &&
7739          "ResumePhi must have a single user");
7740   auto *EpiResumePhiVPI =
7741       cast<VPInstruction>(*find_if(EpiRedResult->users(), IsResumePhi));
7742   auto *EpiResumePhi = cast<PHINode>(State.get(EpiResumePhiVPI, true));
7743   EpiResumePhi->setIncomingValueForBlock(
7744       BypassBlock, MainResumePhi->getIncomingValueForBlock(BypassBlock));
7745 }
7746 
7747 DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
7748     ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan,
7749     InnerLoopVectorizer &ILV, DominatorTree *DT, bool VectorizingEpilogue,
7750     const DenseMap<const SCEV *, Value *> *ExpandedSCEVs) {
7751   assert(BestVPlan.hasVF(BestVF) &&
7752          "Trying to execute plan with unsupported VF");
7753   assert(BestVPlan.hasUF(BestUF) &&
7754          "Trying to execute plan with unsupported UF");
7755   assert(
7756       ((VectorizingEpilogue && ExpandedSCEVs) ||
7757        (!VectorizingEpilogue && !ExpandedSCEVs)) &&
7758       "expanded SCEVs to reuse can only be used during epilogue vectorization");
7759 
7760   // TODO: Move to VPlan transform stage once the transition to the VPlan-based
7761   // cost model is complete for better cost estimates.
7762   VPlanTransforms::unrollByUF(BestVPlan, BestUF,
7763                               OrigLoop->getHeader()->getContext());
7764   VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
7765   VPlanTransforms::convertToConcreteRecipes(BestVPlan);
7766 
7767   // Perform the actual loop transformation.
7768   VPTransformState State(&TTI, BestVF, BestUF, LI, DT, ILV.Builder, &ILV,
7769                          &BestVPlan, OrigLoop->getParentLoop(),
7770                          Legal->getWidestInductionType());
7771 
7772 #ifdef EXPENSIVE_CHECKS
7773   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
7774 #endif
7775 
7776   // 0. Generate SCEV-dependent code in the entry, including TripCount, before
7777   // making any changes to the CFG.
7778   if (!BestVPlan.getEntry()->empty())
7779     BestVPlan.getEntry()->execute(&State);
7780 
7781   if (!ILV.getTripCount())
7782     ILV.setTripCount(State.get(BestVPlan.getTripCount(), VPLane(0)));
7783   else
7784     assert(VectorizingEpilogue && "should only re-use the existing trip "
7785                                   "count during epilogue vectorization");
7786 
7787   // 1. Set up the skeleton for vectorization, including vector pre-header and
7788   // middle block. The vector loop is created during VPlan execution.
7789   VPBasicBlock *VectorPH =
7790       cast<VPBasicBlock>(BestVPlan.getEntry()->getSingleSuccessor());
7791   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(
7792       ExpandedSCEVs ? *ExpandedSCEVs : State.ExpandedSCEVs);
7793   if (VectorizingEpilogue)
7794     VPlanTransforms::removeDeadRecipes(BestVPlan);
7795 
7796   // Only use noalias metadata when using memory checks guaranteeing no overlap
7797   // across all iterations.
7798   const LoopAccessInfo *LAI = ILV.Legal->getLAI();
7799   std::unique_ptr<LoopVersioning> LVer = nullptr;
7800   if (LAI && !LAI->getRuntimePointerChecking()->getChecks().empty() &&
7801       !LAI->getRuntimePointerChecking()->getDiffChecks()) {
7802 
7803     //  We currently don't use LoopVersioning for the actual loop cloning but we
7804     //  still use it to add the noalias metadata.
7805     //  TODO: Find a better way to re-use LoopVersioning functionality to add
7806     //        metadata.
7807     LVer = std::make_unique<LoopVersioning>(
7808         *LAI, LAI->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, DT,
7809         PSE.getSE());
7810     State.LVer = &*LVer;
7811     State.LVer->prepareNoAliasMetadata();
7812   }
7813 
7814   ILV.printDebugTracesAtStart();
7815 
7816   //===------------------------------------------------===//
7817   //
7818   // Notice: any optimization or new instruction that goes
7819   // into the code below should also be implemented in
7820   // the cost-model.
7821   //
7822   //===------------------------------------------------===//
7823 
7824   // 2. Copy and widen instructions from the old loop into the new loop.
7825   BestVPlan.prepareToExecute(
7826       ILV.getTripCount(),
7827       ILV.getOrCreateVectorTripCount(ILV.LoopVectorPreHeader), State);
7828   replaceVPBBWithIRVPBB(VectorPH, State.CFG.PrevBB);
7829 
7830   BestVPlan.execute(&State);
7831 
7832   auto *MiddleVPBB = BestVPlan.getMiddleBlock();
7833   // 2.5 When vectorizing the epilogue, fix reduction and induction resume
7834   // values from the additional bypass block.
7835   if (VectorizingEpilogue) {
7836     assert(!ILV.Legal->hasUncountableEarlyExit() &&
7837            "Epilogue vectorisation not yet supported with early exits");
7838     BasicBlock *BypassBlock = ILV.getAdditionalBypassBlock();
7839     for (VPRecipeBase &R : *MiddleVPBB) {
7840       fixReductionScalarResumeWhenVectorizingEpilog(
7841           &R, State, State.CFG.VPBB2IRBB[MiddleVPBB], BypassBlock);
7842     }
7843     BasicBlock *PH = OrigLoop->getLoopPreheader();
7844     for (const auto &[IVPhi, _] : Legal->getInductionVars()) {
7845       auto *Inc = cast<PHINode>(IVPhi->getIncomingValueForBlock(PH));
7846       Value *V = ILV.getInductionAdditionalBypassValue(IVPhi);
7847       Inc->setIncomingValueForBlock(BypassBlock, V);
7848     }
7849   }
7850 
7851   // 2.6. Maintain Loop Hints
7852   // Keep all loop hints from the original loop on the vector loop (we'll
7853   // replace the vectorizer-specific hints below).
7854   if (auto *LoopRegion = BestVPlan.getVectorLoopRegion()) {
7855     MDNode *OrigLoopID = OrigLoop->getLoopID();
7856 
7857     std::optional<MDNode *> VectorizedLoopID =
7858         makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7859                                         LLVMLoopVectorizeFollowupVectorized});
7860 
7861     VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
7862     Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]);
7863     if (VectorizedLoopID) {
7864       L->setLoopID(*VectorizedLoopID);
7865     } else {
7866       // Keep all loop hints from the original loop on the vector loop (we'll
7867       // replace the vectorizer-specific hints below).
7868       if (MDNode *LID = OrigLoop->getLoopID())
7869         L->setLoopID(LID);
7870 
7871       LoopVectorizeHints Hints(L, true, *ORE);
7872       Hints.setAlreadyVectorized();
7873     }
7874     TargetTransformInfo::UnrollingPreferences UP;
7875     TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE);
7876     if (!UP.UnrollVectorizedLoop || VectorizingEpilogue)
7877       addRuntimeUnrollDisableMetaData(L);
7878   }
7879 
7880   // 3. Fix the vectorized code: take care of header phi's, live-outs,
7881   //    predication, updating analyses.
7882   ILV.fixVectorizedLoop(State);
7883 
7884   ILV.printDebugTracesAtEnd();
7885 
7886   // 4. Adjust branch weight of the branch in the middle block.
7887   if (BestVPlan.getVectorLoopRegion()) {
7888     auto *MiddleVPBB = BestVPlan.getMiddleBlock();
7889     auto *MiddleTerm =
7890         cast<BranchInst>(State.CFG.VPBB2IRBB[MiddleVPBB]->getTerminator());
7891     if (MiddleTerm->isConditional() &&
7892         hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
7893       // Assume that `Count % VectorTripCount` is equally distributed.
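      // For example (illustrative): with VF = 4 and UF = 2, TripCount below is
      // 8 and the weights are {1, 7}, modelling that `Count % VectorTripCount`
      // is zero (so the scalar remainder can be skipped) in roughly 1 out of 8
      // cases.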
7894       unsigned TripCount = BestVPlan.getUF() * State.VF.getKnownMinValue();
7895       assert(TripCount > 0 && "trip count should not be zero");
7896       const uint32_t Weights[] = {1, TripCount - 1};
7897       setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false);
7898     }
7899   }
7900 
7901   return State.ExpandedSCEVs;
7902 }
7903 
7904 //===--------------------------------------------------------------------===//
7905 // EpilogueVectorizerMainLoop
7906 //===--------------------------------------------------------------------===//
7907 
7908 /// This function is partially responsible for generating the control flow
7909 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7910 BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton(
7911     const SCEV2ValueTy &ExpandedSCEVs) {
7912   createVectorLoopSkeleton("");
7913 
7914   // Generate the code to check the minimum iteration count of the vector
7915   // epilogue (see below).
7916   EPI.EpilogueIterationCountCheck =
7917       emitIterationCountCheck(LoopScalarPreHeader, true);
7918   EPI.EpilogueIterationCountCheck->setName("iter.check");
7919 
7920   // Generate the code to check any assumptions that we've made for SCEV
7921   // expressions.
7922   EPI.SCEVSafetyCheck = emitSCEVChecks(LoopScalarPreHeader);
7923 
7924   // Generate the code that checks at runtime if arrays overlap. We put the
7925   // checks into a separate block to make the more common case of few elements
7926   // faster.
7927   EPI.MemSafetyCheck = emitMemRuntimeChecks(LoopScalarPreHeader);
7928 
7929   // Generate the iteration count check for the main loop, *after* the check
7930   // for the epilogue loop, so that the path-length is shorter for the case
7931   // that goes directly through the vector epilogue. The longer-path length for
7932   // the main loop is compensated for by the gain from vectorizing the larger
7933   // trip count. Note: the branch will get updated later on when we vectorize
7934   // the epilogue.
7935   EPI.MainLoopIterationCountCheck =
7936       emitIterationCountCheck(LoopScalarPreHeader, false);
7937 
7938   // Generate the induction variable.
7939   EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
7940 
7941   return LoopVectorPreHeader;
7942 }
7943 
7944 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
7945   LLVM_DEBUG({
7946     dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7947            << "Main Loop VF:" << EPI.MainLoopVF
7948            << ", Main Loop UF:" << EPI.MainLoopUF
7949            << ", Epilogue Loop VF:" << EPI.EpilogueVF
7950            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7951   });
7952 }
7953 
7954 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
7955   DEBUG_WITH_TYPE(VerboseDebug, {
7956     dbgs() << "intermediate fn:\n"
7957            << *OrigLoop->getHeader()->getParent() << "\n";
7958   });
7959 }
7960 
7961 BasicBlock *
7962 EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
7963                                                     bool ForEpilogue) {
7964   assert(Bypass && "Expected valid bypass basic block.");
7965   ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF;
7966   unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
7967   Value *Count = getTripCount();
7968   // Reuse existing vector loop preheader for TC checks.
7969   // Note that new preheader block is generated for vector loop.
7970   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
7971   IRBuilder<> Builder(TCCheckBlock->getTerminator());
7972 
7973   // Generate code to check if the loop's trip count is less than VF * UF of the
7974   // main vector loop.
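  // For example (illustrative): with VF = 4 and UF = 2 this emits roughly
  //   %min.iters.check = icmp ult i64 %count, 8
  // (ult becomes ule when a scalar epilogue is required), branching to the
  // bypass block when the check is true.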
7975   auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF.isVector()
7976                                                     : VF.isVector())
7977                ? ICmpInst::ICMP_ULE
7978                : ICmpInst::ICMP_ULT;
7979 
7980   Value *CheckMinIters = Builder.CreateICmp(
7981       P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor),
7982       "min.iters.check");
7983 
7984   if (!ForEpilogue)
7985     TCCheckBlock->setName("vector.main.loop.iter.check");
7986 
7987   // Create new preheader for vector loop.
7988   LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
7989                                    DT, LI, nullptr, "vector.ph");
7990 
7991   if (ForEpilogue) {
7992     assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
7993                                  DT->getNode(Bypass)->getIDom()) &&
7994            "TC check is expected to dominate Bypass");
7995 
7996     LoopBypassBlocks.push_back(TCCheckBlock);
7997 
7998     // Save the trip count so we don't have to regenerate it in the
7999     // vec.epilog.iter.check. This is safe to do because the trip count
8000     // generated here dominates the vector epilog iter check.
8001     EPI.TripCount = Count;
8002   }
8003 
8004   BranchInst &BI =
8005       *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
8006   if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
8007     setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
8008   ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
8009 
8010   introduceCheckBlockInVPlan(TCCheckBlock);
8011   return TCCheckBlock;
8012 }
8013 
8014 //===--------------------------------------------------------------------===//
8015 // EpilogueVectorizerEpilogueLoop
8016 //===--------------------------------------------------------------------===//
8017 
8018 /// This function is partially responsible for generating the control flow
8019 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
8020 BasicBlock *
8021 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
8022     const SCEV2ValueTy &ExpandedSCEVs) {
8023   createVectorLoopSkeleton("vec.epilog.");
8024 
8025   // Now, compare the remaining count and if there aren't enough iterations to
8026   // execute the vectorized epilogue, skip to the scalar part.
8027   LoopVectorPreHeader->setName("vec.epilog.ph");
8028   BasicBlock *VecEpilogueIterationCountCheck =
8029       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->begin(), DT, LI,
8030                  nullptr, "vec.epilog.iter.check", true);
8031   emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader,
8032                                           VecEpilogueIterationCountCheck);
8033   AdditionalBypassBlock = VecEpilogueIterationCountCheck;
8034 
8035   // Adjust the control flow taking the state info from the main loop
8036   // vectorization into account.
8037   assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
8038          "expected this to be saved from the previous pass.");
8039   EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
8040       VecEpilogueIterationCountCheck, LoopVectorPreHeader);
8041 
8042   EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
8043       VecEpilogueIterationCountCheck, LoopScalarPreHeader);
8044 
8045   if (EPI.SCEVSafetyCheck)
8046     EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
8047         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
8048   if (EPI.MemSafetyCheck)
8049     EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
8050         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
8051 
8052   DT->changeImmediateDominator(LoopScalarPreHeader,
8053                                EPI.EpilogueIterationCountCheck);
8054   // Keep track of bypass blocks, as they feed start values to the induction and
8055   // reduction phis in the scalar loop preheader.
8056   if (EPI.SCEVSafetyCheck)
8057     LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
8058   if (EPI.MemSafetyCheck)
8059     LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
8060   LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
8061 
8062   // The vec.epilog.iter.check block may contain Phi nodes from inductions or
8063   // reductions which merge control-flow from the latch block and the middle
8064   // block. Update the incoming values here and move the Phi into the preheader.
8065   SmallVector<PHINode *, 4> PhisInBlock;
8066   for (PHINode &Phi : VecEpilogueIterationCountCheck->phis())
8067     PhisInBlock.push_back(&Phi);
8068 
8069   for (PHINode *Phi : PhisInBlock) {
8070     Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI());
8071     Phi->replaceIncomingBlockWith(
8072         VecEpilogueIterationCountCheck->getSinglePredecessor(),
8073         VecEpilogueIterationCountCheck);
8074 
8075     // If the phi doesn't have an incoming value from the
8076     // EpilogueIterationCountCheck, we are done. Otherwise remove the incoming
8077     // value and also those from other check blocks. This is needed for
8078     // reduction phis only.
8079     if (none_of(Phi->blocks(), [&](BasicBlock *IncB) {
8080           return EPI.EpilogueIterationCountCheck == IncB;
8081         }))
8082       continue;
8083     Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
8084     if (EPI.SCEVSafetyCheck)
8085       Phi->removeIncomingValue(EPI.SCEVSafetyCheck);
8086     if (EPI.MemSafetyCheck)
8087       Phi->removeIncomingValue(EPI.MemSafetyCheck);
8088   }
8089 
8090   // Generate bypass values from the additional bypass block. Note that when the
8091   // vectorized epilogue is skipped due to the iteration count check, the
8092   // resume value for the induction variable comes from the trip count of the
8093   // main vector loop, passed as the second argument.
8094   createInductionAdditionalBypassValues(ExpandedSCEVs, EPI.VectorTripCount);
8095   return LoopVectorPreHeader;
8096 }
8097 
8098 BasicBlock *
8099 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
8100     BasicBlock *Bypass, BasicBlock *Insert) {
8101 
8102   assert(EPI.TripCount &&
8103          "Expected trip count to have been saved in the first pass.");
8104   assert(
8105       (!isa<Instruction>(EPI.TripCount) ||
8106        DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
8107       "saved trip count does not dominate insertion point.");
8108   Value *TC = EPI.TripCount;
8109   IRBuilder<> Builder(Insert->getTerminator());
8110   Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
8111 
8112   // Generate code to check if the loop's trip count is less than VF * UF of the
8113   // vector epilogue loop.
8114   auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector())
8115                ? ICmpInst::ICMP_ULE
8116                : ICmpInst::ICMP_ULT;
8117 
8118   Value *CheckMinIters =
8119       Builder.CreateICmp(P, Count,
8120                          createStepForVF(Builder, Count->getType(),
8121                                          EPI.EpilogueVF, EPI.EpilogueUF),
8122                          "min.epilog.iters.check");
8123 
8124   BranchInst &BI =
8125       *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
8126   if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
8127     unsigned MainLoopStep = UF * VF.getKnownMinValue();
8128     unsigned EpilogueLoopStep =
8129         EPI.EpilogueUF * EPI.EpilogueVF.getKnownMinValue();
8130     // We assume the remaining `Count` is equally distributed in
8131     // [0, MainLoopStep)
8132     // So the probability for `Count < EpilogueLoopStep` should be
8133     // min(MainLoopStep, EpilogueLoopStep) / MainLoopStep
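    // For example (illustrative): MainLoopStep = 16 and EpilogueLoopStep = 4
    // give EstimatedSkipCount = 4 and weights {4, 12}, i.e. the vectorized
    // epilogue is assumed to be skipped in roughly 4 out of 16 cases.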
8134     unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep);
8135     const uint32_t Weights[] = {EstimatedSkipCount,
8136                                 MainLoopStep - EstimatedSkipCount};
8137     setBranchWeights(BI, Weights, /*IsExpected=*/false);
8138   }
8139   ReplaceInstWithInst(Insert->getTerminator(), &BI);
8140   LoopBypassBlocks.push_back(Insert);
8141 
8142   // A new entry block has been created for the epilogue VPlan. Hook it in, as
8143   // otherwise we would try to modify the entry to the main vector loop.
8144   VPIRBasicBlock *NewEntry = Plan.createVPIRBasicBlock(Insert);
8145   VPBasicBlock *OldEntry = Plan.getEntry();
8146   VPBlockUtils::reassociateBlocks(OldEntry, NewEntry);
8147   Plan.setEntry(NewEntry);
8148   // OldEntry is now dead and will be cleaned up when the plan gets destroyed.
8149 
8150   introduceCheckBlockInVPlan(Insert);
8151   return Insert;
8152 }
8153 
8154 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
8155   LLVM_DEBUG({
8156     dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
8157            << "Epilogue Loop VF:" << EPI.EpilogueVF
8158            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
8159   });
8160 }
8161 
8162 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
8163   DEBUG_WITH_TYPE(VerboseDebug, {
8164     dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
8165   });
8166 }
8167 
8168 iterator_range<mapped_iterator<Use *, std::function<VPValue *(Value *)>>>
8169 VPRecipeBuilder::mapToVPValues(User::op_range Operands) {
8170   std::function<VPValue *(Value *)> Fn = [this](Value *Op) {
8171     return getVPValueOrAddLiveIn(Op);
8172   };
8173   return map_range(Operands, Fn);
8174 }
8175 
8176 void VPRecipeBuilder::createSwitchEdgeMasks(SwitchInst *SI) {
8177   BasicBlock *Src = SI->getParent();
8178   assert(!OrigLoop->isLoopExiting(Src) &&
8179          all_of(successors(Src),
8180                 [this](BasicBlock *Succ) {
8181                   return OrigLoop->getHeader() != Succ;
8182                 }) &&
8183          "unsupported switch either exiting loop or continuing to header");
8184   // Create masks where the terminator in Src is a switch. We create masks for
8185   // all edges at the same time. This is more efficient, as we can create and
8186   // collect compares for all cases once.
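  // For example (illustrative): for a switch on %x with cases 1 and 2 going to
  // block A, case 3 going to block B, and default block D, the resulting edge
  // masks are conceptually:
  //   A: SrcMask && ((x == 1) || (x == 2))
  //   B: SrcMask && (x == 3)
  //   D: SrcMask && !(((x == 1) || (x == 2)) || (x == 3))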
8187   VPValue *Cond = getVPValueOrAddLiveIn(SI->getCondition());
8188   BasicBlock *DefaultDst = SI->getDefaultDest();
8189   MapVector<BasicBlock *, SmallVector<VPValue *>> Dst2Compares;
8190   for (auto &C : SI->cases()) {
8191     BasicBlock *Dst = C.getCaseSuccessor();
8192     assert(!EdgeMaskCache.contains({Src, Dst}) && "Edge masks already created");
8193     // Cases whose destination is the same as default are redundant and can be
8194     // ignored - they will get there anyhow.
8195     if (Dst == DefaultDst)
8196       continue;
8197     auto &Compares = Dst2Compares[Dst];
8198     VPValue *V = getVPValueOrAddLiveIn(C.getCaseValue());
8199     Compares.push_back(Builder.createICmp(CmpInst::ICMP_EQ, Cond, V));
8200   }
8201 
8202   // We need to handle 2 separate cases below for all entries in Dst2Compares,
8203   // which excludes destinations matching the default destination.
8204   VPValue *SrcMask = getBlockInMask(Src);
8205   VPValue *DefaultMask = nullptr;
8206   for (const auto &[Dst, Conds] : Dst2Compares) {
8207     // 1. Dst is not the default destination. Dst is reached if any of the cases
8208     // with destination == Dst are taken. Join the conditions for each case
8209     // whose destination == Dst using an OR.
8210     VPValue *Mask = Conds[0];
8211     for (VPValue *V : ArrayRef<VPValue *>(Conds).drop_front())
8212       Mask = Builder.createOr(Mask, V);
8213     if (SrcMask)
8214       Mask = Builder.createLogicalAnd(SrcMask, Mask);
8215     EdgeMaskCache[{Src, Dst}] = Mask;
8216 
8217     // 2. Create the mask for the default destination, which is reached if none
8218     // of the cases with destination != default destination are taken. Join the
8219     // conditions for each case whose destination is not the default destination
8220     // using an OR and negate it.
8221     DefaultMask = DefaultMask ? Builder.createOr(DefaultMask, Mask) : Mask;
8222   }
8223 
8224   if (DefaultMask) {
8225     DefaultMask = Builder.createNot(DefaultMask);
8226     if (SrcMask)
8227       DefaultMask = Builder.createLogicalAnd(SrcMask, DefaultMask);
8228   }
8229   EdgeMaskCache[{Src, DefaultDst}] = DefaultMask;
8230 }
8231 
8232 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
8233   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
8234 
8235   // Look for cached value.
8236   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
8237   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
8238   if (ECEntryIt != EdgeMaskCache.end())
8239     return ECEntryIt->second;
8240 
8241   if (auto *SI = dyn_cast<SwitchInst>(Src->getTerminator())) {
8242     createSwitchEdgeMasks(SI);
8243     assert(EdgeMaskCache.contains(Edge) && "Mask for Edge not created?");
8244     return EdgeMaskCache[Edge];
8245   }
8246 
8247   VPValue *SrcMask = getBlockInMask(Src);
8248 
8249   // The terminator has to be a branch inst!
8250   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
8251   assert(BI && "Unexpected terminator found");
8252   if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
8253     return EdgeMaskCache[Edge] = SrcMask;
8254 
8255   // If source is an exiting block, we know the exit edge is dynamically dead
8256   // in the vector loop, and thus we don't need to restrict the mask.  Avoid
8257   // adding uses of an otherwise potentially dead instruction unless we are
8258   // vectorizing a loop with uncountable exits. In that case, we always
8259   // materialize the mask.
8260   if (OrigLoop->isLoopExiting(Src) &&
8261       Src != Legal->getUncountableEarlyExitingBlock())
8262     return EdgeMaskCache[Edge] = SrcMask;
8263 
8264   VPValue *EdgeMask = getVPValueOrAddLiveIn(BI->getCondition());
8265   assert(EdgeMask && "No Edge Mask found for condition");
8266 
8267   if (BI->getSuccessor(0) != Dst)
8268     EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc());
8269 
8270   if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
8271     // The bitwise 'And' of SrcMask and EdgeMask introduces new UB if SrcMask
8272     // is false and EdgeMask is poison. Avoid that by using 'LogicalAnd'
8273     // instead which generates 'select i1 SrcMask, i1 EdgeMask, i1 false'.
8274     EdgeMask = Builder.createLogicalAnd(SrcMask, EdgeMask, BI->getDebugLoc());
8275   }
8276 
8277   return EdgeMaskCache[Edge] = EdgeMask;
8278 }
8279 
8280 VPValue *VPRecipeBuilder::getEdgeMask(BasicBlock *Src, BasicBlock *Dst) const {
8281   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
8282 
8283   // Look for cached value.
8284   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
8285   EdgeMaskCacheTy::const_iterator ECEntryIt = EdgeMaskCache.find(Edge);
8286   assert(ECEntryIt != EdgeMaskCache.end() &&
8287          "looking up mask for edge which has not been created");
8288   return ECEntryIt->second;
8289 }
8290 
8291 void VPRecipeBuilder::createHeaderMask() {
8292   BasicBlock *Header = OrigLoop->getHeader();
8293 
8294   // When not folding the tail, use nullptr to model all-true mask.
8295   if (!CM.foldTailByMasking()) {
8296     BlockMaskCache[Header] = nullptr;
8297     return;
8298   }
8299 
8300   // Introduce the early-exit compare IV <= BTC to form header block mask.
8301   // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
8302   // constructing the desired canonical IV in the header block as its first
8303   // non-phi instructions.
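  // For example (illustrative): with a trip count of 10 and VF = 4, BTC is 9;
  // on the iteration where the widened IV lanes are {8, 9, 10, 11} the
  // ICMP_ULE mask below is {1, 1, 0, 0}, disabling the two out-of-range lanes.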
8304 
8305   VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
8306   auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
8307   auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV());
8308   HeaderVPBB->insert(IV, NewInsertionPoint);
8309 
8310   VPBuilder::InsertPointGuard Guard(Builder);
8311   Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
8312   VPValue *BlockMask = nullptr;
8313   VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
8314   BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC);
8315   BlockMaskCache[Header] = BlockMask;
8316 }
8317 
8318 VPValue *VPRecipeBuilder::getBlockInMask(BasicBlock *BB) const {
8319   // Return the cached value.
8320   BlockMaskCacheTy::const_iterator BCEntryIt = BlockMaskCache.find(BB);
8321   assert(BCEntryIt != BlockMaskCache.end() &&
8322          "Trying to access mask for block without one.");
8323   return BCEntryIt->second;
8324 }
8325 
8326 void VPRecipeBuilder::createBlockInMask(BasicBlock *BB) {
8327   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
8328   assert(BlockMaskCache.count(BB) == 0 && "Mask for block already computed");
8329   assert(OrigLoop->getHeader() != BB &&
8330          "Loop header must have cached block mask");
8331 
8332   // All-one mask is modelled as no-mask following the convention for masked
8333   // load/store/gather/scatter. Initialize BlockMask to no-mask.
8334   VPValue *BlockMask = nullptr;
8335   // This is the block mask. We OR all unique incoming edges.
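  // For example (illustrative): a block with two predecessors P1 and P2 gets
  // the mask createEdgeMask(P1, BB) | createEdgeMask(P2, BB), unless one of
  // the edge masks is all-one (nullptr), in which case the block mask is
  // all-one too.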
8336   for (auto *Predecessor :
8337        SetVector<BasicBlock *>(pred_begin(BB), pred_end(BB))) {
8338     VPValue *EdgeMask = createEdgeMask(Predecessor, BB);
8339     if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is too.
8340       BlockMaskCache[BB] = EdgeMask;
8341       return;
8342     }
8343 
8344     if (!BlockMask) { // BlockMask has its initialized nullptr value.
8345       BlockMask = EdgeMask;
8346       continue;
8347     }
8348 
8349     BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
8350   }
8351 
8352   BlockMaskCache[BB] = BlockMask;
8353 }
8354 
8355 VPWidenMemoryRecipe *
8356 VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
8357                                   VFRange &Range) {
8358   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
8359          "Must be called with either a load or store");
8360 
8361   auto WillWiden = [&](ElementCount VF) -> bool {
8362     LoopVectorizationCostModel::InstWidening Decision =
8363         CM.getWideningDecision(I, VF);
8364     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
8365            "CM decision should be taken at this point.");
8366     if (Decision == LoopVectorizationCostModel::CM_Interleave)
8367       return true;
8368     if (CM.isScalarAfterVectorization(I, VF) ||
8369         CM.isProfitableToScalarize(I, VF))
8370       return false;
8371     return Decision != LoopVectorizationCostModel::CM_Scalarize;
8372   };
8373 
8374   if (!LoopVectorizationPlanner::getDecisionAndClampRange(WillWiden, Range))
8375     return nullptr;
8376 
8377   VPValue *Mask = nullptr;
8378   if (Legal->isMaskRequired(I))
8379     Mask = getBlockInMask(I->getParent());
8380 
8381   // Determine if the pointer operand of the access is either consecutive or
8382   // reverse consecutive.
8383   LoopVectorizationCostModel::InstWidening Decision =
8384       CM.getWideningDecision(I, Range.Start);
8385   bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
8386   bool Consecutive =
8387       Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
8388 
8389   VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
8390   if (Consecutive) {
8391     auto *GEP = dyn_cast<GetElementPtrInst>(
8392         Ptr->getUnderlyingValue()->stripPointerCasts());
8393     VPSingleDefRecipe *VectorPtr;
8394     if (Reverse) {
8395       // When folding the tail, we may compute an address that we don't compute
8396       // in the original scalar loop, and it may not be inbounds. Drop Inbounds
8397       // in that case.
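      // For example (illustrative): with VF = 4 and only 2 elements remaining
      // under tail folding, the reverse vector pointer covers lanes the scalar
      // loop never accesses, so the 'inbounds' guarantee may not hold there.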
8398       GEPNoWrapFlags Flags =
8399           (CM.foldTailByMasking() || !GEP || !GEP->isInBounds())
8400               ? GEPNoWrapFlags::none()
8401               : GEPNoWrapFlags::inBounds();
8402       VectorPtr = new VPReverseVectorPointerRecipe(
8403           Ptr, &Plan.getVF(), getLoadStoreType(I), Flags, I->getDebugLoc());
8404     } else {
8405       VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I),
8406                                             GEP ? GEP->getNoWrapFlags()
8407                                                 : GEPNoWrapFlags::none(),
8408                                             I->getDebugLoc());
8409     }
8410     Builder.getInsertBlock()->appendRecipe(VectorPtr);
8411     Ptr = VectorPtr;
8412   }
8413   if (LoadInst *Load = dyn_cast<LoadInst>(I))
8414     return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
8415                                  I->getDebugLoc());
8416 
8417   StoreInst *Store = cast<StoreInst>(I);
8418   return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive,
8419                                 Reverse, I->getDebugLoc());
8420 }
8421 
8422 /// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will also
8423 /// insert a recipe to expand the step for the induction recipe.
8424 static VPWidenIntOrFpInductionRecipe *
8425 createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc,
8426                             VPValue *Start, const InductionDescriptor &IndDesc,
8427                             VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop) {
8428   assert(IndDesc.getStartValue() ==
8429          Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
8430   assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) &&
8431          "step must be loop invariant");
8432 
8433   VPValue *Step =
8434       vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE);
8435   if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) {
8436     return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(),
8437                                              IndDesc, TruncI,
8438                                              TruncI->getDebugLoc());
8439   }
8440   assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
8441   return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(),
8442                                            IndDesc, Phi->getDebugLoc());
8443 }
8444 
8445 VPHeaderPHIRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI(
8446     PHINode *Phi, ArrayRef<VPValue *> Operands, VFRange &Range) {
8447 
8448   // Check if this is an integer or fp induction. If so, build the recipe that
8449   // produces its scalar and vector values.
8450   if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
8451     return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, Plan,
8452                                        *PSE.getSE(), *OrigLoop);
8453 
8454   // Check if this is pointer induction. If so, build the recipe for it.
8455   if (auto *II = Legal->getPointerInductionDescriptor(Phi)) {
8456     VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep(),
8457                                                            *PSE.getSE());
8458     return new VPWidenPointerInductionRecipe(
8459         Phi, Operands[0], Step, *II,
8460         LoopVectorizationPlanner::getDecisionAndClampRange(
8461             [&](ElementCount VF) {
8462               return CM.isScalarAfterVectorization(Phi, VF);
8463             },
8464             Range),
8465         Phi->getDebugLoc());
8466   }
8467   return nullptr;
8468 }
8469 
8470 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
8471     TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range) {
8472   // Optimize the special case where the source is a constant integer
8473   // induction variable. Notice that we can only optimize the 'trunc' case
8474   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
8475   // (c) other casts depend on pointer size.
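       // For example (illustrative): "%t = trunc i64 %iv to i32", where %iv is a
       // primary integer induction, can be widened directly as a 32-bit
       // induction instead of widening the 64-bit induction and truncating every
       // lane.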
8476 
8477   // Determine whether \p K is a truncation based on an induction variable that
8478   // can be optimized.
8479   auto IsOptimizableIVTruncate =
8480       [&](Instruction *K) -> std::function<bool(ElementCount)> {
8481     return [=](ElementCount VF) -> bool {
8482       return CM.isOptimizableIVTruncate(K, VF);
8483     };
8484   };
8485 
8486   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8487           IsOptimizableIVTruncate(I), Range)) {
8488 
8489     auto *Phi = cast<PHINode>(I->getOperand(0));
8490     const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi);
8491     VPValue *Start = Plan.getOrAddLiveIn(II.getStartValue());
8492     return createWidenInductionRecipes(Phi, I, Start, II, Plan, *PSE.getSE(),
8493                                        *OrigLoop);
8494   }
8495   return nullptr;
8496 }
8497 
8498 VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi,
8499                                            ArrayRef<VPValue *> Operands) {
8500   unsigned NumIncoming = Phi->getNumIncomingValues();
8501 
8502   // We know that all PHIs in non-header blocks are converted into selects, so
8503   // we don't have to worry about the insertion order and we can just use the
8504   // builder. At this point we generate the predication tree. There may be
8505   // duplications since this is a simple recursive scan, but future
8506   // optimizations will clean it up.
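       // For example (an illustrative sketch): a phi "p = phi [a, pred0],
       // [b, pred1]" in a non-header block becomes a blend with operands
       //   (a, EdgeMask(pred0, BB), b, EdgeMask(pred1, BB))
       // which is later lowered to selects.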
8507   SmallVector<VPValue *, 2> OperandsWithMask;
8508 
8509   for (unsigned In = 0; In < NumIncoming; In++) {
8510     OperandsWithMask.push_back(Operands[In]);
8511     VPValue *EdgeMask =
8512         getEdgeMask(Phi->getIncomingBlock(In), Phi->getParent());
8513     if (!EdgeMask) {
8514       assert(In == 0 && "Both null and non-null edge masks found");
8515       assert(all_equal(Operands) &&
8516              "Distinct incoming values with one having a full mask");
8517       break;
8518     }
8519     OperandsWithMask.push_back(EdgeMask);
8520   }
8521   return new VPBlendRecipe(Phi, OperandsWithMask);
8522 }
8523 
8524 VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
8525                                                    ArrayRef<VPValue *> Operands,
8526                                                    VFRange &Range) {
8527   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8528       [this, CI](ElementCount VF) {
8529         return CM.isScalarWithPredication(CI, VF);
8530       },
8531       Range);
8532 
8533   if (IsPredicated)
8534     return nullptr;
8535 
8536   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8537   if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8538              ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8539              ID == Intrinsic::pseudoprobe ||
8540              ID == Intrinsic::experimental_noalias_scope_decl))
8541     return nullptr;
8542 
8543   SmallVector<VPValue *, 4> Ops(Operands.take_front(CI->arg_size()));
8544 
8545   // Is it beneficial to perform an intrinsic call compared to a lib call?
8546   bool ShouldUseVectorIntrinsic =
8547       ID && LoopVectorizationPlanner::getDecisionAndClampRange(
8548                 [&](ElementCount VF) -> bool {
8549                   return CM.getCallWideningDecision(CI, VF).Kind ==
8550                          LoopVectorizationCostModel::CM_IntrinsicCall;
8551                 },
8552                 Range);
8553   if (ShouldUseVectorIntrinsic)
8554     return new VPWidenIntrinsicRecipe(*CI, ID, Ops, CI->getType(),
8555                                       CI->getDebugLoc());
8556 
8557   Function *Variant = nullptr;
8558   std::optional<unsigned> MaskPos;
8559   // Is it better to call a vectorized version of the function than to
8560   // scalarize the call?
8561   auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange(
8562       [&](ElementCount VF) -> bool {
8563         // The following case may be scalarized depending on the VF.
8564         // The flag shows whether we can use a usual Call for the vectorized
8565         // version of the instruction.
8566 
8567         // If we've found a variant at a previous VF, then stop looking. A
8568         // vectorized variant of a function expects input in a certain shape
8569         // -- basically the number of input registers, the number of lanes
8570         // per register, and whether there's a mask required.
8571         // We store a pointer to the variant in the VPWidenCallRecipe, so
8572         // once we have an appropriate variant it's only valid for that VF.
8573         // This will force a different vplan to be generated for each VF that
8574         // finds a valid variant.
8575         if (Variant)
8576           return false;
8577         LoopVectorizationCostModel::CallWideningDecision Decision =
8578             CM.getCallWideningDecision(CI, VF);
8579         if (Decision.Kind == LoopVectorizationCostModel::CM_VectorCall) {
8580           Variant = Decision.Variant;
8581           MaskPos = Decision.MaskPos;
8582           return true;
8583         }
8584 
8585         return false;
8586       },
8587       Range);
8588   if (ShouldUseVectorCall) {
8589     if (MaskPos.has_value()) {
8590       // We have 2 cases that would require a mask:
8591       //   1) The block needs to be predicated, either due to a conditional
8592       //      in the scalar loop or use of an active lane mask with
8593       //      tail-folding, and we use the appropriate mask for the block.
8594       //   2) No mask is required for the block, but the only available
8595       //      vector variant at this VF requires a mask, so we synthesize an
8596       //      all-true mask.
8597       VPValue *Mask = nullptr;
8598       if (Legal->isMaskRequired(CI))
8599         Mask = getBlockInMask(CI->getParent());
8600       else
8601         Mask = Plan.getOrAddLiveIn(
8602             ConstantInt::getTrue(IntegerType::getInt1Ty(CI->getContext())));
8603 
8604       Ops.insert(Ops.begin() + *MaskPos, Mask);
8605     }
8606 
8607     Ops.push_back(Operands.back());
8608     return new VPWidenCallRecipe(CI, Variant, Ops, CI->getDebugLoc());
8609   }
8610 
8611   return nullptr;
8612 }
8613 
8614 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
8615   assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8616          !isa<StoreInst>(I) && "Instruction should have been handled earlier");
8617   // Instruction should be widened, unless it is scalar after vectorization,
8618   // scalarization is profitable or it is predicated.
8619   auto WillScalarize = [this, I](ElementCount VF) -> bool {
8620     return CM.isScalarAfterVectorization(I, VF) ||
8621            CM.isProfitableToScalarize(I, VF) ||
8622            CM.isScalarWithPredication(I, VF);
8623   };
8624   return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
8625                                                              Range);
8626 }
8627 
8628 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
8629                                            ArrayRef<VPValue *> Operands,
8630                                            VPBasicBlock *VPBB) {
8631   switch (I->getOpcode()) {
8632   default:
8633     return nullptr;
8634   case Instruction::SDiv:
8635   case Instruction::UDiv:
8636   case Instruction::SRem:
8637   case Instruction::URem: {
8638     // If not provably safe, use a select to form a safe divisor before widening the
8639     // div/rem operation itself.  Otherwise fall through to general handling below.
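         // For example (illustrative): a predicated "udiv %x, %y" is widened as
         //   %safe.y = select <block mask>, %y, 1
         //   %res    = udiv %x, %safe.y
         // so that masked-off lanes divide by 1 rather than by a potentially
         // undefined divisor.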
8640     if (CM.isPredicatedInst(I)) {
8641       SmallVector<VPValue *> Ops(Operands);
8642       VPValue *Mask = getBlockInMask(I->getParent());
8643       VPValue *One =
8644           Plan.getOrAddLiveIn(ConstantInt::get(I->getType(), 1u, false));
8645       auto *SafeRHS = Builder.createSelect(Mask, Ops[1], One, I->getDebugLoc());
8646       Ops[1] = SafeRHS;
8647       return new VPWidenRecipe(*I, make_range(Ops.begin(), Ops.end()));
8648     }
8649     [[fallthrough]];
8650   }
8651   case Instruction::Add:
8652   case Instruction::And:
8653   case Instruction::AShr:
8654   case Instruction::FAdd:
8655   case Instruction::FCmp:
8656   case Instruction::FDiv:
8657   case Instruction::FMul:
8658   case Instruction::FNeg:
8659   case Instruction::FRem:
8660   case Instruction::FSub:
8661   case Instruction::ICmp:
8662   case Instruction::LShr:
8663   case Instruction::Mul:
8664   case Instruction::Or:
8665   case Instruction::Select:
8666   case Instruction::Shl:
8667   case Instruction::Sub:
8668   case Instruction::Xor:
8669   case Instruction::Freeze:
8670     SmallVector<VPValue *> NewOps(Operands);
8671     if (Instruction::isBinaryOp(I->getOpcode())) {
8672       // The legacy cost model uses SCEV to check if some of the operands are
8673       // constants. To match the legacy cost model's behavior, use SCEV to try
8674       // to replace operands with constants.
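           // For example (illustrative): if an operand %n is loop-invariant and
           // SCEV can prove %n == 42, it is replaced by the live-in constant 42
           // so both cost models see the same operand.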
8675       ScalarEvolution &SE = *PSE.getSE();
8676       auto GetConstantViaSCEV = [this, &SE](VPValue *Op) {
8677         Value *V = Op->getUnderlyingValue();
8678         if (isa<Constant>(V) || !SE.isSCEVable(V->getType()))
8679           return Op;
8680         auto *C = dyn_cast<SCEVConstant>(SE.getSCEV(V));
8681         if (!C)
8682           return Op;
8683         return Plan.getOrAddLiveIn(C->getValue());
8684       };
8685       // For Mul, the legacy cost model checks both operands.
8686       if (I->getOpcode() == Instruction::Mul)
8687         NewOps[0] = GetConstantViaSCEV(NewOps[0]);
8688       // For other binops, the legacy cost model only checks the second operand.
8689       NewOps[1] = GetConstantViaSCEV(NewOps[1]);
8690     }
8691     return new VPWidenRecipe(*I, make_range(NewOps.begin(), NewOps.end()));
8692   };
8693 }
8694 
8695 VPHistogramRecipe *
8696 VPRecipeBuilder::tryToWidenHistogram(const HistogramInfo *HI,
8697                                      ArrayRef<VPValue *> Operands) {
8698   // FIXME: Support other operations.
8699   unsigned Opcode = HI->Update->getOpcode();
8700   assert((Opcode == Instruction::Add || Opcode == Instruction::Sub) &&
8701          "Histogram update operation must be an Add or Sub");
8702 
8703   SmallVector<VPValue *, 3> HGramOps;
8704   // Bucket address.
8705   HGramOps.push_back(Operands[1]);
8706   // Increment value.
8707   HGramOps.push_back(getVPValueOrAddLiveIn(HI->Update->getOperand(1)));
8708 
8709   // In case of predicated execution (due to tail-folding, or conditional
8710   // execution, or both), pass the relevant mask.
8711   if (Legal->isMaskRequired(HI->Store))
8712     HGramOps.push_back(getBlockInMask(HI->Store->getParent()));
8713 
8714   return new VPHistogramRecipe(Opcode,
8715                                make_range(HGramOps.begin(), HGramOps.end()),
8716                                HI->Store->getDebugLoc());
8717 }
8718 
8719 void VPRecipeBuilder::fixHeaderPhis() {
8720   BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
8721   for (VPHeaderPHIRecipe *R : PhisToFix) {
8722     auto *PN = cast<PHINode>(R->getUnderlyingValue());
8723     VPRecipeBase *IncR =
8724         getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
8725     R->addOperand(IncR->getVPSingleValue());
8726   }
8727 }
8728 
8729 VPReplicateRecipe *VPRecipeBuilder::handleReplication(Instruction *I,
8730                                                       VFRange &Range) {
8731   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
8732       [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8733       Range);
8734 
8735   bool IsPredicated = CM.isPredicatedInst(I);
8736 
8737   // Even if the instruction is not marked as uniform, there are certain
8738   // intrinsic calls that can be effectively treated as such, so we check for
8739   // them here. Conservatively, we only do this for scalable vectors, since
8740   // for fixed-width VFs we can always fall back on full scalarization.
8741   if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
8742     switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
8743     case Intrinsic::assume:
8744     case Intrinsic::lifetime_start:
8745     case Intrinsic::lifetime_end:
8746       // For scalable vectors if one of the operands is variant then we still
8747       // want to mark as uniform, which will generate one instruction for just
8748       // the first lane of the vector. We can't scalarize the call in the same
8749       // way as for fixed-width vectors because we don't know how many lanes
8750       // there are.
8751       //
8752       // The reasons for doing it this way for scalable vectors are:
8753       //   1. For the assume intrinsic generating the instruction for the first
8754       //      lane is still better than not generating any at all. For
8755       //      example, the input may be a splat across all lanes.
8756       //   2. For the lifetime start/end intrinsics the pointer operand only
8757       //      does anything useful when the input comes from a stack object,
8758       //      which suggests it should always be uniform. For non-stack objects
8759       //      the effect is to poison the object, which still allows us to
8760       //      remove the call.
8761       IsUniform = true;
8762       break;
8763     default:
8764       break;
8765     }
8766   }
8767   VPValue *BlockInMask = nullptr;
8768   if (!IsPredicated) {
8769     // Finalize the recipe for Instr, first if it is not predicated.
8770     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8771   } else {
8772     LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8773     // Instructions marked for predication are replicated and a mask operand is
8774     // added initially. Masked replicate recipes will later be placed under an
8775     // if-then construct to prevent side-effects. Generate recipes to compute
8776     // the block mask for this region.
8777     BlockInMask = getBlockInMask(I->getParent());
8778   }
8779 
8780   // Note that there is some custom logic to mark some intrinsics as uniform
8781   // manually above for scalable vectors, which this assert needs to account for
8782   // as well.
8783   assert((Range.Start.isScalar() || !IsUniform || !IsPredicated ||
8784           (Range.Start.isScalable() && isa<IntrinsicInst>(I))) &&
8785          "Should not predicate a uniform recipe");
8786   auto *Recipe = new VPReplicateRecipe(I, mapToVPValues(I->operands()),
8787                                        IsUniform, BlockInMask);
8788   return Recipe;
8789 }
8790 
8791 /// Find all possible partial reductions in the loop and track all of those that
8792 /// are valid so recipes can be formed later.
8793 void VPRecipeBuilder::collectScaledReductions(VFRange &Range) {
8794   // Find all possible partial reductions.
8795   SmallVector<std::pair<PartialReductionChain, unsigned>, 1>
8796       PartialReductionChains;
8797   for (const auto &[Phi, RdxDesc] : Legal->getReductionVars())
8798     if (std::optional<std::pair<PartialReductionChain, unsigned>> Pair =
8799             getScaledReduction(Phi, RdxDesc, Range))
8800       PartialReductionChains.push_back(*Pair);
8801 
8802   // A partial reduction is invalid if any of its extends are used by
8803   // something that isn't another partial reduction. This is because the
8804   // extends are intended to be lowered along with the reduction itself.
8805 
8806   // Build up a set of partial reduction bin ops for efficient use checking.
8807   SmallSet<User *, 4> PartialReductionBinOps;
8808   for (const auto &[PartialRdx, _] : PartialReductionChains)
8809     PartialReductionBinOps.insert(PartialRdx.BinOp);
8810 
8811   auto ExtendIsOnlyUsedByPartialReductions =
8812       [&PartialReductionBinOps](Instruction *Extend) {
8813         return all_of(Extend->users(), [&](const User *U) {
8814           return PartialReductionBinOps.contains(U);
8815         });
8816       };
8817 
8818   // Check if each use of a chain's two extends is a partial reduction
8819   // and only add those that don't have non-partial reduction users.
8820   for (auto Pair : PartialReductionChains) {
8821     PartialReductionChain Chain = Pair.first;
8822     if (ExtendIsOnlyUsedByPartialReductions(Chain.ExtendA) &&
8823         ExtendIsOnlyUsedByPartialReductions(Chain.ExtendB))
8824       ScaledReductionExitInstrs.insert(std::make_pair(Chain.Reduction, Pair));
8825   }
8826 }
8827 
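     /// Examines \p PHI and its update to see whether they form a partial
     /// reduction chain of the shape (an illustrative sketch of what the
     /// matching below accepts):
     ///   %ext.a = sext i8 %a to i32
     ///   %ext.b = sext i8 %b to i32
     ///   %bin   = mul i32 %ext.a, %ext.b
     ///   %rdx   = add i32 %phi, %bin
     /// The returned scale factor is the ratio of the accumulator width to the
     /// extended source width, e.g. 4 for an i32 accumulator over i8 inputs.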
8828 std::optional<std::pair<PartialReductionChain, unsigned>>
8829 VPRecipeBuilder::getScaledReduction(PHINode *PHI,
8830                                     const RecurrenceDescriptor &Rdx,
8831                                     VFRange &Range) {
8832   // TODO: Allow scaling reductions when predicating. The select at
8833   // the end of the loop chooses between the phi value and most recent
8834   // reduction result, both of which have different VFs to the active lane
8835   // mask when scaling.
8836   if (CM.blockNeedsPredicationForAnyReason(Rdx.getLoopExitInstr()->getParent()))
8837     return std::nullopt;
8838 
8839   auto *Update = dyn_cast<BinaryOperator>(Rdx.getLoopExitInstr());
8840   if (!Update)
8841     return std::nullopt;
8842 
8843   Value *Op = Update->getOperand(0);
8844   Value *PhiOp = Update->getOperand(1);
8845   if (Op == PHI) {
8846     Op = Update->getOperand(1);
8847     PhiOp = Update->getOperand(0);
8848   }
8849   if (PhiOp != PHI)
8850     return std::nullopt;
8851 
8852   auto *BinOp = dyn_cast<BinaryOperator>(Op);
8853   if (!BinOp || !BinOp->hasOneUse())
8854     return std::nullopt;
8855 
8856   using namespace llvm::PatternMatch;
8857   Value *A, *B;
8858   if (!match(BinOp->getOperand(0), m_ZExtOrSExt(m_Value(A))) ||
8859       !match(BinOp->getOperand(1), m_ZExtOrSExt(m_Value(B))))
8860     return std::nullopt;
8861 
8862   Instruction *ExtA = cast<Instruction>(BinOp->getOperand(0));
8863   Instruction *ExtB = cast<Instruction>(BinOp->getOperand(1));
8864 
8865   TTI::PartialReductionExtendKind OpAExtend =
8866       TargetTransformInfo::getPartialReductionExtendKind(ExtA);
8867   TTI::PartialReductionExtendKind OpBExtend =
8868       TargetTransformInfo::getPartialReductionExtendKind(ExtB);
8869 
8870   PartialReductionChain Chain(Rdx.getLoopExitInstr(), ExtA, ExtB, BinOp);
8871 
8872   unsigned TargetScaleFactor =
8873       PHI->getType()->getPrimitiveSizeInBits().getKnownScalarFactor(
8874           A->getType()->getPrimitiveSizeInBits());
8875 
8876   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8877           [&](ElementCount VF) {
8878             InstructionCost Cost = TTI->getPartialReductionCost(
8879                 Update->getOpcode(), A->getType(), B->getType(), PHI->getType(),
8880                 VF, OpAExtend, OpBExtend,
8881                 std::make_optional(BinOp->getOpcode()));
8882             return Cost.isValid();
8883           },
8884           Range))
8885     return std::make_pair(Chain, TargetScaleFactor);
8886 
8887   return std::nullopt;
8888 }
8889 
8890 VPRecipeBase *
8891 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8892                                         ArrayRef<VPValue *> Operands,
8893                                         VFRange &Range, VPBasicBlock *VPBB) {
8894   // First, check for specific widening recipes that deal with inductions, Phi
8895   // nodes, calls and memory operations.
8896   VPRecipeBase *Recipe;
8897   if (auto *Phi = dyn_cast<PHINode>(Instr)) {
8898     if (Phi->getParent() != OrigLoop->getHeader())
8899       return tryToBlend(Phi, Operands);
8900 
8901     if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range)))
8902       return Recipe;
8903 
8904     VPHeaderPHIRecipe *PhiRecipe = nullptr;
8905     assert((Legal->isReductionVariable(Phi) ||
8906             Legal->isFixedOrderRecurrence(Phi)) &&
8907            "can only widen reductions and fixed-order recurrences here");
8908     VPValue *StartV = Operands[0];
8909     if (Legal->isReductionVariable(Phi)) {
8910       const RecurrenceDescriptor &RdxDesc =
8911           Legal->getReductionVars().find(Phi)->second;
8912       assert(RdxDesc.getRecurrenceStartValue() ==
8913              Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8914 
8915       // If the PHI is used by a partial reduction, set the scale factor.
8916       std::optional<std::pair<PartialReductionChain, unsigned>> Pair =
8917           getScaledReductionForInstr(RdxDesc.getLoopExitInstr());
8918       unsigned ScaleFactor = Pair ? Pair->second : 1;
8919       PhiRecipe = new VPReductionPHIRecipe(
8920           Phi, RdxDesc, *StartV, CM.isInLoopReduction(Phi),
8921           CM.useOrderedReductions(RdxDesc), ScaleFactor);
8922     } else {
8923       // TODO: Currently fixed-order recurrences are modeled as chains of
8924       // first-order recurrences. If there are no users of the intermediate
8925       // recurrences in the chain, the fixed order recurrence should be modeled
8926       // directly, enabling more efficient codegen.
8927       PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
8928     }
8929 
8930     PhisToFix.push_back(PhiRecipe);
8931     return PhiRecipe;
8932   }
8933 
8934   if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate(
8935                                     cast<TruncInst>(Instr), Operands, Range)))
8936     return Recipe;
8937 
8938   // All widen recipes below deal only with VF > 1.
8939   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8940           [&](ElementCount VF) { return VF.isScalar(); }, Range))
8941     return nullptr;
8942 
8943   if (auto *CI = dyn_cast<CallInst>(Instr))
8944     return tryToWidenCall(CI, Operands, Range);
8945 
8946   if (StoreInst *SI = dyn_cast<StoreInst>(Instr))
8947     if (auto HistInfo = Legal->getHistogramInfo(SI))
8948       return tryToWidenHistogram(*HistInfo, Operands);
8949 
8950   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8951     return tryToWidenMemory(Instr, Operands, Range);
8952 
8953   if (getScaledReductionForInstr(Instr))
8954     return tryToCreatePartialReduction(Instr, Operands);
8955 
8956   if (!shouldWiden(Instr, Range))
8957     return nullptr;
8958 
8959   if (auto *GEP = dyn_cast<GetElementPtrInst>(Instr))
8960     return new VPWidenGEPRecipe(GEP,
8961                                 make_range(Operands.begin(), Operands.end()));
8962 
8963   if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8964     return new VPWidenSelectRecipe(
8965         *SI, make_range(Operands.begin(), Operands.end()));
8966   }
8967 
8968   if (auto *CI = dyn_cast<CastInst>(Instr)) {
8969     return new VPWidenCastRecipe(CI->getOpcode(), Operands[0], CI->getType(),
8970                                  *CI);
8971   }
8972 
8973   return tryToWiden(Instr, Operands, VPBB);
8974 }
8975 
8976 VPRecipeBase *
8977 VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
8978                                              ArrayRef<VPValue *> Operands) {
8979   assert(Operands.size() == 2 &&
8980          "Unexpected number of operands for partial reduction");
8981 
8982   VPValue *BinOp = Operands[0];
8983   VPValue *Phi = Operands[1];
8984   if (isa<VPReductionPHIRecipe>(BinOp->getDefiningRecipe()))
8985     std::swap(BinOp, Phi);
8986 
8987   return new VPPartialReductionRecipe(Reduction->getOpcode(), BinOp, Phi,
8988                                       Reduction);
8989 }
8990 
8991 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8992                                                         ElementCount MaxVF) {
8993   assert(OrigLoop->isInnermost() && "Inner loop expected.");
8994 
8995   auto MaxVFTimes2 = MaxVF * 2;
8996   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
8997     VFRange SubRange = {VF, MaxVFTimes2};
8998     if (auto Plan = tryToBuildVPlanWithVPRecipes(SubRange)) {
8999       // Now optimize the initial VPlan.
9000       if (!Plan->hasVF(ElementCount::getFixed(1)))
9001         VPlanTransforms::truncateToMinimalBitwidths(*Plan,
9002                                                     CM.getMinimalBitwidths());
9003       VPlanTransforms::optimize(*Plan);
9004       // TODO: try to put it close to addActiveLaneMask().
9005       // Discard the plan if it is not EVL-compatible
9006       if (CM.foldTailWithEVL() && !VPlanTransforms::tryAddExplicitVectorLength(
9007                                       *Plan, CM.getMaxSafeElements()))
9008         break;
9009       assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
9010       VPlans.push_back(std::move(Plan));
9011     }
9012     VF = SubRange.End;
9013   }
9014 }
9015 
9016 // Add the necessary canonical IV and branch recipes required to control the
9017 // loop.
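     // The emitted recipes roughly correspond to (an illustrative sketch):
     //   vector.body:
     //     %index      = canonical-iv-phi [ 0, vector.ph ], [ %index.next, ... ]
     //     ...
     //     %index.next = add (nuw?) %index, VF * UF
     //     branch-on-count %index.next, %vector.trip.count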
9018 static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
9019                                   DebugLoc DL) {
9020   Value *StartIdx = ConstantInt::get(IdxTy, 0);
9021   auto *StartV = Plan.getOrAddLiveIn(StartIdx);
9022 
9023   // Add a VPCanonicalIVPHIRecipe starting at 0 to the header.
9024   auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
9025   VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
9026   VPBasicBlock *Header = TopRegion->getEntryBasicBlock();
9027   Header->insert(CanonicalIVPHI, Header->begin());
9028 
9029   VPBuilder Builder(TopRegion->getExitingBasicBlock());
9030   // Add a VPInstruction to increment the scalar canonical IV by VF * UF.
9031   auto *CanonicalIVIncrement = Builder.createOverflowingOp(
9032       Instruction::Add, {CanonicalIVPHI, &Plan.getVFxUF()}, {HasNUW, false}, DL,
9033       "index.next");
9034   CanonicalIVPHI->addOperand(CanonicalIVIncrement);
9035 
9036   // Add the BranchOnCount VPInstruction to the latch.
9037   Builder.createNaryOp(VPInstruction::BranchOnCount,
9038                        {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
9039 }
9040 
9041 /// Create and return a ResumePhi for \p WideIV, unless it is truncated. If the
9042 /// induction recipe is not canonical, creates a VPDerivedIVRecipe to compute
9043 /// the end value of the induction.
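     /// For an integer induction with start S and step T (an illustrative
     /// sketch), the end value is derived as S + VectorTripCount * T; the resume
     /// phi then yields that end value when reached from the middle block, and S
     /// when the vector loop is bypassed.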
9044 static VPValue *addResumePhiRecipeForInduction(VPWidenInductionRecipe *WideIV,
9045                                                VPBuilder &VectorPHBuilder,
9046                                                VPBuilder &ScalarPHBuilder,
9047                                                VPTypeAnalysis &TypeInfo,
9048                                                VPValue *VectorTC) {
9049   auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
9050   // Truncated wide inductions resume from the last lane of their vector value
9051   // in the last vector iteration, which is handled elsewhere.
9052   if (WideIntOrFp && WideIntOrFp->getTruncInst())
9053     return nullptr;
9054 
9055   VPValue *Start = WideIV->getStartValue();
9056   VPValue *Step = WideIV->getStepValue();
9057   const InductionDescriptor &ID = WideIV->getInductionDescriptor();
9058   VPValue *EndValue = VectorTC;
9059   if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
9060     EndValue = VectorPHBuilder.createDerivedIV(
9061         ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
9062         Start, VectorTC, Step);
9063   }
9064 
9065   // EndValue is derived from the vector trip count (which has the same type as
9066   // the widest induction) and thus may be wider than the induction here.
9067   Type *ScalarTypeOfWideIV = TypeInfo.inferScalarType(WideIV);
9068   if (ScalarTypeOfWideIV != TypeInfo.inferScalarType(EndValue)) {
9069     EndValue = VectorPHBuilder.createScalarCast(Instruction::Trunc, EndValue,
9070                                                 ScalarTypeOfWideIV,
9071                                                 WideIV->getDebugLoc());
9072   }
9073 
9074   auto *ResumePhiRecipe =
9075       ScalarPHBuilder.createNaryOp(VPInstruction::ResumePhi, {EndValue, Start},
9076                                    WideIV->getDebugLoc(), "bc.resume.val");
9077   return ResumePhiRecipe;
9078 }
9079 
9080 /// Create resume phis in the scalar preheader for first-order recurrences,
9081 /// reductions and inductions, and update the VPIRInstructions wrapping the
9082 /// original phis in the scalar header.
9083 static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan) {
9084   VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
9085   auto *ScalarPH = Plan.getScalarPreheader();
9086   auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getSinglePredecessor());
9087   VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
9088   VPBuilder VectorPHBuilder(
9089       cast<VPBasicBlock>(VectorRegion->getSinglePredecessor()));
9090   VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
9091   VPBuilder ScalarPHBuilder(ScalarPH);
9092   VPValue *OneVPV = Plan.getOrAddLiveIn(
9093       ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1));
9094   for (VPRecipeBase &ScalarPhiR : *Plan.getScalarHeader()) {
9095     auto *ScalarPhiIRI = cast<VPIRInstruction>(&ScalarPhiR);
9096     auto *ScalarPhiI = dyn_cast<PHINode>(&ScalarPhiIRI->getInstruction());
9097     if (!ScalarPhiI)
9098       break;
9099 
9100     auto *VectorPhiR = cast<VPHeaderPHIRecipe>(Builder.getRecipe(ScalarPhiI));
9101     if (auto *WideIVR = dyn_cast<VPWidenInductionRecipe>(VectorPhiR)) {
9102       if (VPValue *ResumePhi = addResumePhiRecipeForInduction(
9103               WideIVR, VectorPHBuilder, ScalarPHBuilder, TypeInfo,
9104               &Plan.getVectorTripCount())) {
9105         ScalarPhiIRI->addOperand(ResumePhi);
9106         continue;
9107       }
9108       // TODO: Also handle truncated inductions here. Computing end-values
9109       // separately should be done as a VPlan-to-VPlan optimization, after
9110       // legalizing all resume values to use the last lane from the loop.
9111       assert(cast<VPWidenIntOrFpInductionRecipe>(VectorPhiR)->getTruncInst() &&
9112              "should only skip truncated wide inductions");
9113       continue;
9114     }
9115 
9116     // The backedge value provides the value to resume coming out of a loop,
9117     // which for FORs is a vector whose last element needs to be extracted. The
9118     // start value provides the value if the loop is bypassed.
9119     bool IsFOR = isa<VPFirstOrderRecurrencePHIRecipe>(VectorPhiR);
9120     auto *ResumeFromVectorLoop = VectorPhiR->getBackedgeValue();
9121     assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
9122            "Cannot handle loops with uncountable early exits");
9123     if (IsFOR)
9124       ResumeFromVectorLoop = MiddleBuilder.createNaryOp(
9125           VPInstruction::ExtractFromEnd, {ResumeFromVectorLoop, OneVPV}, {},
9126           "vector.recur.extract");
9127     StringRef Name = IsFOR ? "scalar.recur.init" : "bc.merge.rdx";
9128     auto *ResumePhiR = ScalarPHBuilder.createNaryOp(
9129         VPInstruction::ResumePhi,
9130         {ResumeFromVectorLoop, VectorPhiR->getStartValue()}, {}, Name);
9131     ScalarPhiIRI->addOperand(ResumePhiR);
9132   }
9133 }
9134 
9135 /// Return true if \p VPV is an optimizable IV or IV use. That is, if \p VPV is
9136 /// either an untruncated wide induction, or if it increments a wide induction
9137 /// by its step.
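     /// For example (illustrative), both %iv and %iv.next below are optimizable:
     ///   %iv      = WIDEN-INDUCTION %start, %step
     ///   %iv.next = add %iv, %step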
9138 static bool isOptimizableIVOrUse(VPValue *VPV) {
9139   VPRecipeBase *Def = VPV->getDefiningRecipe();
9140   if (!Def)
9141     return false;
9142   auto *WideIV = dyn_cast<VPWidenInductionRecipe>(Def);
9143   if (WideIV) {
9144     // VPV itself is a wide induction, separately compute the end value for exit
9145     // users if it is not a truncated IV.
9146     return isa<VPWidenPointerInductionRecipe>(WideIV) ||
9147            !cast<VPWidenIntOrFpInductionRecipe>(WideIV)->getTruncInst();
9148   }
9149 
9150   // Check if VPV is an optimizable induction increment.
9151   if (Def->getNumOperands() != 2)
9152     return false;
9153   WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(0));
9154   if (!WideIV)
9155     WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(1));
9156   if (!WideIV)
9157     return false;
9158 
9159   using namespace VPlanPatternMatch;
9160   auto &ID = WideIV->getInductionDescriptor();
9161 
9162   // Check if VPV increments the induction by the induction step.
9163   VPValue *IVStep = WideIV->getStepValue();
9164   switch (ID.getInductionOpcode()) {
9165   case Instruction::Add:
9166     return match(VPV, m_c_Binary<Instruction::Add>(m_Specific(WideIV),
9167                                                    m_Specific(IVStep)));
9168   case Instruction::FAdd:
9169     return match(VPV, m_c_Binary<Instruction::FAdd>(m_Specific(WideIV),
9170                                                     m_Specific(IVStep)));
9171   case Instruction::FSub:
9172     return match(VPV, m_Binary<Instruction::FSub>(m_Specific(WideIV),
9173                                                   m_Specific(IVStep)));
9174   case Instruction::Sub: {
9175     // IVStep will be the negated step of the subtraction. Check if Step == -1 *
9176     // IVStep.
9177     VPValue *Step;
9178     if (!match(VPV, m_Binary<Instruction::Sub>(m_VPValue(), m_VPValue(Step))) ||
9179         !Step->isLiveIn() || !IVStep->isLiveIn())
9180       return false;
9181     auto *StepCI = dyn_cast<ConstantInt>(Step->getLiveInIRValue());
9182     auto *IVStepCI = dyn_cast<ConstantInt>(IVStep->getLiveInIRValue());
9183     return StepCI && IVStepCI &&
9184            StepCI->getValue() == (-1 * IVStepCI->getValue());
9185   }
9186   default:
9187     return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
9188            match(VPV, m_GetElementPtr(m_Specific(WideIV),
9189                                       m_Specific(WideIV->getStepValue())));
9190   }
9191   llvm_unreachable("should have been covered by switch above");
9192 }
9193 
9194 // Collect VPIRInstructions for phis in the exit blocks that are modeled
9195 // in VPlan and add the exiting VPValue as operand. Some exiting values are not
9196 // modeled explicitly yet and won't be included. Those are un-truncated
9197 // VPWidenIntOrFpInductionRecipe, VPWidenPointerInductionRecipe and induction
9198 // increments.
9199 static SetVector<VPIRInstruction *>
9200 collectUsersInExitBlocks(Loop *OrigLoop, VPRecipeBuilder &Builder,
9201                          VPlan &Plan) {
9202   auto *MiddleVPBB = Plan.getMiddleBlock();
9203   SetVector<VPIRInstruction *> ExitUsersToFix;
9204   for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
9205     for (VPRecipeBase &R : *ExitVPBB) {
9206       auto *ExitIRI = dyn_cast<VPIRInstruction>(&R);
9207       if (!ExitIRI)
9208         continue;
9209       auto *ExitPhi = dyn_cast<PHINode>(&ExitIRI->getInstruction());
9210       if (!ExitPhi)
9211         break;
9212       for (VPBlockBase *PredVPBB : ExitVPBB->getPredecessors()) {
9213         BasicBlock *ExitingBB = OrigLoop->getLoopLatch();
9214         if (PredVPBB != MiddleVPBB) {
9215           SmallVector<BasicBlock *> ExitingBlocks;
9216           OrigLoop->getExitingBlocks(ExitingBlocks);
9217           assert(ExitingBlocks.size() == 2 && "only support 2 exiting blocks");
9218           ExitingBB = ExitingBB == ExitingBlocks[0] ? ExitingBlocks[1]
9219                                                     : ExitingBlocks[0];
9220         }
9221         Value *IncomingValue = ExitPhi->getIncomingValueForBlock(ExitingBB);
9222         VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue);
9223         // Exit values for inductions are computed and updated outside of VPlan
9224         // and independent of induction recipes.
9225         // TODO: Compute induction exit values in VPlan.
9226         if (isOptimizableIVOrUse(V) &&
9227             ExitVPBB->getSinglePredecessor() == MiddleVPBB)
9228           continue;
9229         ExitUsersToFix.insert(ExitIRI);
9230         ExitIRI->addOperand(V);
9231       }
9232     }
9233   }
9234   return ExitUsersToFix;
9235 }
9236 
9237 // Add exit values to \p Plan. Extracts are added for each entry in \p
9238 // ExitUsersToFix if needed and their operands are updated. Returns true if all
9239 // exit users can be handled, otherwise return false.
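     // For example (illustrative): an LCSSA phi in the exit block that uses a
     // value %v computed in the vector loop gets its operand rewritten to
     // extract-from-end(%v, 1), i.e. the last element of the final vector
     // iteration.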
9240 static bool
9241 addUsersInExitBlocks(VPlan &Plan,
9242                      const SetVector<VPIRInstruction *> &ExitUsersToFix) {
9243   if (ExitUsersToFix.empty())
9244     return true;
9245 
9246   auto *MiddleVPBB = Plan.getMiddleBlock();
9247   VPBuilder B(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
9248 
9249   // Introduce extract for exiting values and update the VPIRInstructions
9250   // modeling the corresponding LCSSA phis.
9251   for (VPIRInstruction *ExitIRI : ExitUsersToFix) {
9252     for (const auto &[Idx, Op] : enumerate(ExitIRI->operands())) {
9253       // Pass live-in values used by exit phis directly through to their users
9254       // in the exit block.
9255       if (Op->isLiveIn())
9256         continue;
9257 
9258       // Currently only live-ins can be used by exit values from blocks not
9259       // exiting via the vector latch through to the middle block.
9260       if (ExitIRI->getParent()->getSinglePredecessor() != MiddleVPBB)
9261         return false;
9262 
9263       LLVMContext &Ctx = ExitIRI->getInstruction().getContext();
9264       VPValue *Ext = B.createNaryOp(VPInstruction::ExtractFromEnd,
9265                                     {Op, Plan.getOrAddLiveIn(ConstantInt::get(
9266                                              IntegerType::get(Ctx, 32), 1))});
9267       ExitIRI->setOperand(Idx, Ext);
9268     }
9269   }
9270   return true;
9271 }
9272 
9273 /// Handle users in the original exit block for first-order recurrences. The
9274 /// penultimate value of each recurrence is fed to its LCSSA phi users in the
9275 /// original exit block via the VPIRInstruction wrapping the corresponding
9276 /// LCSSA phi.
9277 static void addExitUsersForFirstOrderRecurrences(
9278     VPlan &Plan, SetVector<VPIRInstruction *> &ExitUsersToFix) {
9279   VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
9280   auto *ScalarPHVPBB = Plan.getScalarPreheader();
9281   auto *MiddleVPBB = Plan.getMiddleBlock();
9282   VPBuilder ScalarPHBuilder(ScalarPHVPBB);
9283   VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
9284   VPValue *TwoVPV = Plan.getOrAddLiveIn(
9285       ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 2));
9286 
9287   for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
9288     auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
9289     if (!FOR)
9290       continue;
9291 
9292     assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
9293            "Cannot handle loops with uncountable early exits");
9294 
9295     // This is the second phase of vectorizing first-order recurrences, creating
9296     // extracts for users outside the loop. An overview of the transformation is
9297     // described below. Suppose we have the following loop with some use after
9298     // the loop of the last a[i-1],
9299     //
9300     //   for (int i = 0; i < n; ++i) {
9301     //     t = a[i - 1];
9302     //     b[i] = a[i] - t;
9303     //   }
9304     //   use t;
9305     //
9306     // There is a first-order recurrence on "a". For this loop, the shorthand
9307     // scalar IR looks like:
9308     //
9309     //   scalar.ph:
9310     //     s.init = a[-1]
9311     //     br scalar.body
9312     //
9313     //   scalar.body:
9314     //     i = phi [0, scalar.ph], [i+1, scalar.body]
9315     //     s1 = phi [s.init, scalar.ph], [s2, scalar.body]
9316     //     s2 = a[i]
9317     //     b[i] = s2 - s1
9318     //     br cond, scalar.body, exit.block
9319     //
9320     //   exit.block:
9321     //     use = lcssa.phi [s1, scalar.body]
9322     //
9323     // In this example, s1 is a recurrence because its value depends on the
9324     // previous iteration. In the first phase of vectorization, we created a
9325     // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts
9326     // for users in the scalar preheader and exit block.
9327     //
9328     //   vector.ph:
9329     //     v_init = vector(..., ..., ..., a[-1])
9330     //     br vector.body
9331     //
9332     //   vector.body
9333     //     i = phi [0, vector.ph], [i+4, vector.body]
9334     //     v1 = phi [v_init, vector.ph], [v2, vector.body]
9335     //     v2 = a[i, i+1, i+2, i+3]
9336     //     b[i] = v2 - v1
9337     //     // Next, third phase will introduce v1' = splice(v1(3), v2(0, 1, 2))
9338     //     b[i, i+1, i+2, i+3] = v2 - v1
9339     //     br cond, vector.body, middle.block
9340     //
9341     //   middle.block:
9342     //     vector.recur.extract.for.phi = v2(2)
9343     //     vector.recur.extract = v2(3)
9344     //     br cond, scalar.ph, exit.block
9345     //
9346     //   scalar.ph:
9347     //     scalar.recur.init = phi [vector.recur.extract, middle.block],
9348     //                             [s.init, otherwise]
9349     //     br scalar.body
9350     //
9351     //   scalar.body:
9352     //     i = phi [0, scalar.ph], [i+1, scalar.body]
9353     //     s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body]
9354     //     s2 = a[i]
9355     //     b[i] = s2 - s1
9356     //     br cond, scalar.body, exit.block
9357     //
9358     //   exit.block:
9359     //     lo = lcssa.phi [s1, scalar.body],
9360     //                    [vector.recur.extract.for.phi, middle.block]
9361     //
9362     // Now update VPIRInstructions modeling LCSSA phis in the exit block.
9363     // Extract the penultimate value of the recurrence and use it as operand for
9364     // the VPIRInstruction modeling the phi.
9365     for (VPIRInstruction *ExitIRI : ExitUsersToFix) {
9366       if (ExitIRI->getOperand(0) != FOR)
9367         continue;
9368       VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
9369           VPInstruction::ExtractFromEnd, {FOR->getBackedgeValue(), TwoVPV}, {},
9370           "vector.recur.extract.for.phi");
9371       ExitIRI->setOperand(0, PenultimateElement);
9372       ExitUsersToFix.remove(ExitIRI);
9373     }
9374   }
9375 }
9376 
9377 VPlanPtr
9378 LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
9379 
9380   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
9381 
9382   // ---------------------------------------------------------------------------
9383   // Build initial VPlan: Scan the body of the loop in a topological order to
9384   // visit each basic block after having visited its predecessor basic blocks.
9385   // ---------------------------------------------------------------------------
9386 
9387   // Create initial VPlan skeleton, having a basic block for the pre-header
9388   // which contains SCEV expansions that need to happen before the CFG is
9389   // modified; a basic block for the vector pre-header, followed by a region for
9390   // the vector loop, followed by the middle basic block. The skeleton vector
9391   // loop region contains a header and latch basic blocks.
9392 
9393   bool RequiresScalarEpilogueCheck =
9394       LoopVectorizationPlanner::getDecisionAndClampRange(
9395           [this](ElementCount VF) {
9396             return !CM.requiresScalarEpilogue(VF.isVector());
9397           },
9398           Range);
9399   VPlanPtr Plan = VPlan::createInitialVPlan(Legal->getWidestInductionType(),
9400                                             PSE, RequiresScalarEpilogueCheck,
9401                                             CM.foldTailByMasking(), OrigLoop);
9402 
9403   // Don't use getDecisionAndClampRange here, because we don't know the UF,
9404   // so it is better for this function to be conservative rather than to
9405   // split it up into different VPlans.
9406   // TODO: Consider using getDecisionAndClampRange here to split up VPlans.
9407   bool IVUpdateMayOverflow = false;
9408   for (ElementCount VF : Range)
9409     IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF);
9410 
9411   DebugLoc DL = getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
9412   TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow);
9413   // Use NUW for the induction increment if we proved that it won't overflow in
9414   // the vector loop or when not folding the tail. In the latter case, we know
9415   // that the canonical induction increment will not overflow as the vector trip
9416   // count is >= increment and a multiple of the increment.
9417   bool HasNUW = !IVUpdateMayOverflow || Style == TailFoldingStyle::None;
9418   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL);
9419 
9420   VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE,
9421                                 Builder);
9422 
9423   // ---------------------------------------------------------------------------
9424   // Pre-construction: record ingredients whose recipes we'll need to further
9425   // process after constructing the initial VPlan.
9426   // ---------------------------------------------------------------------------
9427 
9428   // For each interleave group which is relevant for this (possibly trimmed)
9429   // Range, add it to the set of groups to be later applied to the VPlan and add
9430   // placeholders for its members' Recipes which we'll be replacing with a
9431   // single VPInterleaveRecipe.
9432   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
9433     auto ApplyIG = [IG, this](ElementCount VF) -> bool {
9434       bool Result = (VF.isVector() && // Query is illegal for VF == 1
9435                      CM.getWideningDecision(IG->getInsertPos(), VF) ==
9436                          LoopVectorizationCostModel::CM_Interleave);
9437       // For scalable vectors, the only interleave factor currently supported
9438       // is 2 since we require the (de)interleave2 intrinsics instead of
9439       // shufflevectors.
9440       assert((!Result || !VF.isScalable() || IG->getFactor() == 2) &&
9441              "Unsupported interleave factor for scalable vectors");
9442       return Result;
9443     };
9444     if (!getDecisionAndClampRange(ApplyIG, Range))
9445       continue;
9446     InterleaveGroups.insert(IG);
9447   }
9448 
9449   // ---------------------------------------------------------------------------
9450   // Construct recipes for the instructions in the loop
9451   // ---------------------------------------------------------------------------
9452 
9453   // Scan the body of the loop in a topological order to visit each basic block
9454   // after having visited its predecessor basic blocks.
9455   LoopBlocksDFS DFS(OrigLoop);
9456   DFS.perform(LI);
9457 
9458   VPBasicBlock *HeaderVPBB = Plan->getVectorLoopRegion()->getEntryBasicBlock();
9459   VPBasicBlock *VPBB = HeaderVPBB;
9460   BasicBlock *HeaderBB = OrigLoop->getHeader();
9461   bool NeedsMasks =
9462       CM.foldTailByMasking() ||
9463       any_of(OrigLoop->blocks(), [this, HeaderBB](BasicBlock *BB) {
9464         bool NeedsBlends = BB != HeaderBB && !BB->phis().empty();
9465         return Legal->blockNeedsPredication(BB) || NeedsBlends;
9466       });
9467 
9468   RecipeBuilder.collectScaledReductions(Range);
9469 
9470   auto *MiddleVPBB = Plan->getMiddleBlock();
9471   VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi();
9472   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
9473     // Relevant instructions from basic block BB will be grouped into VPRecipe
9474     // ingredients and fill a new VPBasicBlock.
9475     if (VPBB != HeaderVPBB)
9476       VPBB->setName(BB->getName());
9477     Builder.setInsertPoint(VPBB);
9478 
9479     if (VPBB == HeaderVPBB)
9480       RecipeBuilder.createHeaderMask();
9481     else if (NeedsMasks)
9482       RecipeBuilder.createBlockInMask(BB);
9483 
9484     // Introduce each ingredient into VPlan.
9485     // TODO: Model and preserve debug intrinsics in VPlan.
9486     for (Instruction &I : drop_end(BB->instructionsWithoutDebug(false))) {
9487       Instruction *Instr = &I;
9488       SmallVector<VPValue *, 4> Operands;
9489       auto *Phi = dyn_cast<PHINode>(Instr);
9490       if (Phi && Phi->getParent() == HeaderBB) {
9491         Operands.push_back(Plan->getOrAddLiveIn(
9492             Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
9493       } else {
9494         auto OpRange = RecipeBuilder.mapToVPValues(Instr->operands());
9495         Operands = {OpRange.begin(), OpRange.end()};
9496       }
9497 
9498       // The stores with invariant address inside the loop will be deleted, and
9499       // in the middle block, a uniform store recipe will be created for the final
9500       // invariant store of the reduction.
9501       StoreInst *SI;
9502       if ((SI = dyn_cast<StoreInst>(&I)) &&
9503           Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) {
9504         // Only create recipe for the final invariant store of the reduction.
9505         if (!Legal->isInvariantStoreOfReduction(SI))
9506           continue;
9507         auto *Recipe = new VPReplicateRecipe(
9508             SI, RecipeBuilder.mapToVPValues(Instr->operands()),
9509             true /* IsUniform */);
9510         Recipe->insertBefore(*MiddleVPBB, MBIP);
9511         continue;
9512       }
9513 
9514       VPRecipeBase *Recipe =
9515           RecipeBuilder.tryToCreateWidenRecipe(Instr, Operands, Range, VPBB);
9516       if (!Recipe)
9517         Recipe = RecipeBuilder.handleReplication(Instr, Range);
9518 
9519       RecipeBuilder.setRecipe(Instr, Recipe);
9520       if (isa<VPHeaderPHIRecipe>(Recipe)) {
9521         // VPHeaderPHIRecipes must be kept in the phi section of HeaderVPBB. In
9522         // the following cases, VPHeaderPHIRecipes may be created after non-phi
9523         // recipes and need to be moved to the phi section of HeaderVPBB:
9524         // * tail-folding (non-phi recipes computing the header mask are
9525         // introduced earlier than regular header phi recipes, and should appear
9526         // after them)
9527         // * Optimizing truncates to VPWidenIntOrFpInductionRecipe.
9528 
9529         assert((HeaderVPBB->getFirstNonPhi() == VPBB->end() ||
9530                 CM.foldTailByMasking() || isa<TruncInst>(Instr)) &&
9531                "unexpected recipe needs moving");
9532         Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
9533       } else
9534         VPBB->appendRecipe(Recipe);
9535     }
9536 
9537     VPBlockUtils::insertBlockAfter(Plan->createVPBasicBlock(""), VPBB);
9538     VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
9539   }
9540 
9541   // After here, VPBB should not be used.
9542   VPBB = nullptr;
9543 
9544   assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
9545          !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
9546          "entry block must be set to a VPRegionBlock having a non-empty entry "
9547          "VPBasicBlock");
9548   RecipeBuilder.fixHeaderPhis();
9549 
9550   // Update wide induction increments to use the same step as the corresponding
9551   // wide induction. This enables detecting induction increments directly in
9552   // VPlan and removes redundant splats.
9553   for (const auto &[Phi, ID] : Legal->getInductionVars()) {
9554     auto *IVInc = cast<Instruction>(
9555         Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
9556     if (IVInc->getOperand(0) != Phi || IVInc->getOpcode() != Instruction::Add)
9557       continue;
9558     VPWidenInductionRecipe *WideIV =
9559         cast<VPWidenInductionRecipe>(RecipeBuilder.getRecipe(Phi));
9560     VPRecipeBase *R = RecipeBuilder.getRecipe(IVInc);
9561     R->setOperand(1, WideIV->getStepValue());
9562   }
9563 
9564   if (auto *UncountableExitingBlock =
9565           Legal->getUncountableEarlyExitingBlock()) {
9566     VPlanTransforms::handleUncountableEarlyExit(
9567         *Plan, *PSE.getSE(), OrigLoop, UncountableExitingBlock, RecipeBuilder);
9568   }
9569   addScalarResumePhis(RecipeBuilder, *Plan);
9570   SetVector<VPIRInstruction *> ExitUsersToFix =
9571       collectUsersInExitBlocks(OrigLoop, RecipeBuilder, *Plan);
9572   addExitUsersForFirstOrderRecurrences(*Plan, ExitUsersToFix);
9573   if (!addUsersInExitBlocks(*Plan, ExitUsersToFix)) {
9574     reportVectorizationFailure(
9575         "Some exit values in loop with uncountable exit not supported yet",
9576         "UncountableEarlyExitLoopsUnsupportedExitValue", ORE, OrigLoop);
9577     return nullptr;
9578   }
9579 
9580   // ---------------------------------------------------------------------------
9581   // Transform initial VPlan: Apply previously taken decisions, in order, to
9582   // bring the VPlan to its final state.
9583   // ---------------------------------------------------------------------------
9584 
9585   // Adjust the recipes for any inloop reductions.
9586   adjustRecipesForReductions(Plan, RecipeBuilder, Range.Start);
9587 
9588   // Interleave memory: for each Interleave Group we marked earlier as relevant
9589   // for this VPlan, replace the Recipes widening its memory instructions with a
9590   // single VPInterleaveRecipe at its insertion point.
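  // For example (illustrative): two loads of A[2*i] and A[2*i+1] that were
  // marked as one interleave group are replaced by a single VPInterleaveRecipe,
  // which later emits one wide load plus shuffles extracting each member.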
9591   VPlanTransforms::createInterleaveGroups(
9592       *Plan, InterleaveGroups, RecipeBuilder, CM.isScalarEpilogueAllowed());
9593 
9594   for (ElementCount VF : Range)
9595     Plan->addVF(VF);
9596   Plan->setName("Initial VPlan");
9597 
9598   // Replace VPValues for known constant strides guaranteed by predicated
9599   // scalar evolution.
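  // For example (illustrative): if versioning assumes a symbolic stride is 1,
  // uses of that stride inside the vector loop region (and in its preheader)
  // are replaced by the live-in constant 1, enabling later simplifications.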
9600   auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) {
9601     auto *R = cast<VPRecipeBase>(&U);
9602     return R->getParent()->getParent() ||
9603            R->getParent() ==
9604                Plan->getVectorLoopRegion()->getSinglePredecessor();
9605   };
9606   for (auto [_, Stride] : Legal->getLAI()->getSymbolicStrides()) {
9607     auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
9608     auto *ScevStride = dyn_cast<SCEVConstant>(PSE.getSCEV(StrideV));
9609     // Only handle constant strides for now.
9610     if (!ScevStride)
9611       continue;
9612 
9613     auto *CI = Plan->getOrAddLiveIn(
9614         ConstantInt::get(Stride->getType(), ScevStride->getAPInt()));
9615     if (VPValue *StrideVPV = Plan->getLiveIn(StrideV))
9616       StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
9617 
9618     // The versioned value may not be used in the loop directly but through a
9619     // sext/zext. Add new live-ins in those cases.
9620     for (Value *U : StrideV->users()) {
9621       if (!isa<SExtInst, ZExtInst>(U))
9622         continue;
9623       VPValue *StrideVPV = Plan->getLiveIn(U);
9624       if (!StrideVPV)
9625         continue;
9626       unsigned BW = U->getType()->getScalarSizeInBits();
9627       APInt C = isa<SExtInst>(U) ? ScevStride->getAPInt().sext(BW)
9628                                  : ScevStride->getAPInt().zext(BW);
9629       VPValue *CI = Plan->getOrAddLiveIn(ConstantInt::get(U->getType(), C));
9630       StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
9631     }
9632   }
9633 
9634   VPlanTransforms::dropPoisonGeneratingRecipes(*Plan, [this](BasicBlock *BB) {
9635     return Legal->blockNeedsPredication(BB);
9636   });
9637 
9638   // Sink users of fixed-order recurrence past the recipe defining the previous
9639   // value and introduce FirstOrderRecurrenceSplice VPInstructions.
9640   if (!VPlanTransforms::adjustFixedOrderRecurrences(*Plan, Builder))
9641     return nullptr;
9642 
9643   if (useActiveLaneMask(Style)) {
9644     // TODO: Move checks to VPlanTransforms::addActiveLaneMask once
9645     // TailFoldingStyle is visible there.
9646     bool ForControlFlow = useActiveLaneMaskForControlFlow(Style);
9647     bool WithoutRuntimeCheck =
9648         Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
9649     VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow,
9650                                        WithoutRuntimeCheck);
9651   }
9652 
9653   assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
9654   return Plan;
9655 }
9656 
9657 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
9658   // Outer loop handling: outer loops may require CFG and instruction level
9659   // transformations before even evaluating whether vectorization is profitable.
9660   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
9661   // the vectorization pipeline.
9662   assert(!OrigLoop->isInnermost());
9663   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
9664 
9665   // Create new empty VPlan
9666   auto Plan = VPlan::createInitialVPlan(Legal->getWidestInductionType(), PSE,
9667                                         true, false, OrigLoop);
9668 
9669   // Build hierarchical CFG
9670   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
9671   HCFGBuilder.buildHierarchicalCFG();
9672 
9673   for (ElementCount VF : Range)
9674     Plan->addVF(VF);
9675 
9676   VPlanTransforms::VPInstructionsToVPRecipes(
9677       Plan,
9678       [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
9679       *PSE.getSE(), *TLI);
9680 
9681   // Remove the existing terminator of the exiting block of the top-most region.
9682   // A BranchOnCount will be added instead when adding the canonical IV recipes.
9683   auto *Term =
9684       Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator();
9685   Term->eraseFromParent();
9686 
9687   // Tail folding is not supported for outer loops, so the induction increment
9688   // is guaranteed to not wrap.
9689   bool HasNUW = true;
9690   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW,
9691                         DebugLoc());
9692 
9693   // Collect mapping of IR header phis to header phi recipes, to be used in
9694   // addScalarResumePhis.
9695   VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE,
9696                                 Builder);
9697   for (auto &R : Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
9698     if (isa<VPCanonicalIVPHIRecipe>(&R))
9699       continue;
9700     auto *HeaderR = cast<VPHeaderPHIRecipe>(&R);
9701     RecipeBuilder.setRecipe(HeaderR->getUnderlyingInstr(), HeaderR);
9702   }
9703   addScalarResumePhis(RecipeBuilder, *Plan);
9704 
9705   assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
9706   return Plan;
9707 }
9708 
9709 // Adjust the recipes for reductions. For in-loop reductions the chain of
9710 // instructions leading from the loop exit instr to the phi needs to be converted
9711 // to reductions, with one operand being vector and the other being the scalar
9712 // reduction chain. For other reductions, a select is introduced between the phi
9713 // and users outside the vector region when folding the tail.
9714 //
9715 // A ComputeReductionResult recipe is added to the middle block, also for
9716 // in-loop reductions which compute their result in-loop, because generating
9717 // the subsequent bc.merge.rdx phi is driven by ComputeReductionResult recipes.
9718 //
9719 // Adjust AnyOf reductions; replace the reduction phi for the selected value
9720 // with a boolean reduction phi node to check if the condition is true in any
9721 // iteration. The final value is selected by the final ComputeReductionResult.
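// For example (illustrative): for an in-loop integer add reduction, the widened
// add feeding the reduction phi's backedge is replaced by a VPReductionRecipe
// that takes the previous link and the vector operand; its result then feeds
// the ComputeReductionResult in the middle block.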
9722 void LoopVectorizationPlanner::adjustRecipesForReductions(
9723     VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) {
9724   using namespace VPlanPatternMatch;
9725   VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion();
9726   VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock();
9727   VPBasicBlock *MiddleVPBB = Plan->getMiddleBlock();
9728   SmallVector<VPRecipeBase *> ToDelete;
9729 
9730   for (VPRecipeBase &R : Header->phis()) {
9731     auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9732     if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered()))
9733       continue;
9734 
9735     const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
9736     RecurKind Kind = RdxDesc.getRecurrenceKind();
9737     assert(
9738         !RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) &&
9739         !RecurrenceDescriptor::isFindLastIVRecurrenceKind(Kind) &&
9740         "AnyOf and FindLast reductions are not allowed for in-loop reductions");
9741 
9742     // Collect the chain of "link" recipes for the reduction starting at PhiR.
9743     SetVector<VPSingleDefRecipe *> Worklist;
9744     Worklist.insert(PhiR);
9745     for (unsigned I = 0; I != Worklist.size(); ++I) {
9746       VPSingleDefRecipe *Cur = Worklist[I];
9747       for (VPUser *U : Cur->users()) {
9748         auto *UserRecipe = cast<VPSingleDefRecipe>(U);
9749         if (!UserRecipe->getParent()->getEnclosingLoopRegion()) {
9750           assert((UserRecipe->getParent() == MiddleVPBB ||
9751                   UserRecipe->getParent() == Plan->getScalarPreheader()) &&
9752                  "U must be either in the loop region, the middle block or the "
9753                  "scalar preheader.");
9754           continue;
9755         }
9756         Worklist.insert(UserRecipe);
9757       }
9758     }
9759 
9760     // Visit operation "Links" along the reduction chain top-down starting from
9761     // the phi until LoopExitValue. We keep track of the previous item
9762     // (PreviousLink) to tell which of the two operands of a Link will remain
9763     // scalar and which will be reduced. For minmax by select(cmp), Link will be
9764     // the select instruction. Blend recipes of in-loop reduction phis will
9765     // get folded to their non-phi operand, as the reduction recipe handles the
9766     // condition directly.
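    // For example (illustrative): for a max reduction 'phi -> cmp -> select',
    // the widened compare is kept as-is and only the select is replaced by a
    // VPReductionRecipe; the select operand that is not the previous link
    // becomes the vector operand.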
9767     VPSingleDefRecipe *PreviousLink = PhiR; // Aka Worklist[0].
9768     for (VPSingleDefRecipe *CurrentLink : Worklist.getArrayRef().drop_front()) {
9769       Instruction *CurrentLinkI = CurrentLink->getUnderlyingInstr();
9770 
9771       // Index of the first operand which holds a non-mask vector operand.
9772       unsigned IndexOfFirstOperand;
9773       // Recognize a call to the llvm.fmuladd intrinsic.
9774       bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
9775       VPValue *VecOp;
9776       VPBasicBlock *LinkVPBB = CurrentLink->getParent();
9777       if (IsFMulAdd) {
9778         assert(
9779             RecurrenceDescriptor::isFMulAddIntrinsic(CurrentLinkI) &&
9780             "Expected instruction to be a call to the llvm.fmuladd intrinsic");
9781         assert(((MinVF.isScalar() && isa<VPReplicateRecipe>(CurrentLink)) ||
9782                 isa<VPWidenIntrinsicRecipe>(CurrentLink)) &&
9783                CurrentLink->getOperand(2) == PreviousLink &&
9784                "expected a call where the previous link is the added operand");
9785 
9786         // If the instruction is a call to the llvm.fmuladd intrinsic then we
9787         // need to create an fmul recipe (multiplying the first two operands of
9788         // the fmuladd together) to use as the vector operand for the fadd
9789         // reduction.
9790         VPInstruction *FMulRecipe = new VPInstruction(
9791             Instruction::FMul,
9792             {CurrentLink->getOperand(0), CurrentLink->getOperand(1)},
9793             CurrentLinkI->getFastMathFlags());
9794         LinkVPBB->insert(FMulRecipe, CurrentLink->getIterator());
9795         VecOp = FMulRecipe;
9796       } else {
9797         auto *Blend = dyn_cast<VPBlendRecipe>(CurrentLink);
9798         if (PhiR->isInLoop() && Blend) {
9799           assert(Blend->getNumIncomingValues() == 2 &&
9800                  "Blend must have 2 incoming values");
9801           if (Blend->getIncomingValue(0) == PhiR)
9802             Blend->replaceAllUsesWith(Blend->getIncomingValue(1));
9803           else {
9804             assert(Blend->getIncomingValue(1) == PhiR &&
9805                    "PhiR must be an operand of the blend");
9806             Blend->replaceAllUsesWith(Blend->getIncomingValue(0));
9807           }
9808           continue;
9809         }
9810 
9811         if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9812           if (isa<VPWidenRecipe>(CurrentLink)) {
9813             assert(isa<CmpInst>(CurrentLinkI) &&
9814                    "need to have the compare of the select");
9815             continue;
9816           }
9817           assert(isa<VPWidenSelectRecipe>(CurrentLink) &&
9818                  "must be a select recipe");
9819           IndexOfFirstOperand = 1;
9820         } else {
9821           assert((MinVF.isScalar() || isa<VPWidenRecipe>(CurrentLink)) &&
9822                  "Expected to replace a VPWidenSC");
9823           IndexOfFirstOperand = 0;
9824         }
9825         // Note that for non-commutable operands (cmp-selects), the semantics of
9826         // the cmp-select are captured in the recurrence kind.
9827         unsigned VecOpId =
9828             CurrentLink->getOperand(IndexOfFirstOperand) == PreviousLink
9829                 ? IndexOfFirstOperand + 1
9830                 : IndexOfFirstOperand;
9831         VecOp = CurrentLink->getOperand(VecOpId);
9832         assert(VecOp != PreviousLink &&
9833                CurrentLink->getOperand(CurrentLink->getNumOperands() - 1 -
9834                                        (VecOpId - IndexOfFirstOperand)) ==
9835                    PreviousLink &&
9836                "PreviousLink must be the operand other than VecOp");
9837       }
9838 
9839       BasicBlock *BB = CurrentLinkI->getParent();
9840       VPValue *CondOp = nullptr;
9841       if (CM.blockNeedsPredicationForAnyReason(BB))
9842         CondOp = RecipeBuilder.getBlockInMask(BB);
9843 
9844       auto *RedRecipe = new VPReductionRecipe(
9845           RdxDesc, CurrentLinkI, PreviousLink, VecOp, CondOp,
9846           CM.useOrderedReductions(RdxDesc), CurrentLinkI->getDebugLoc());
9847       // Append the recipe to the end of the VPBasicBlock because we need to
9848       // ensure that it comes after all of its inputs, including CondOp.
9849       // Delete CurrentLink as it will be invalid if its operand is replaced
9850       // with a reduction defined at the bottom of the block in the next link.
9851       LinkVPBB->appendRecipe(RedRecipe);
9852       CurrentLink->replaceAllUsesWith(RedRecipe);
9853       ToDelete.push_back(CurrentLink);
9854       PreviousLink = RedRecipe;
9855     }
9856   }
9857   VPBasicBlock *LatchVPBB = VectorLoopRegion->getExitingBasicBlock();
9858   Builder.setInsertPoint(&*LatchVPBB->begin());
9859   VPBasicBlock::iterator IP = MiddleVPBB->getFirstNonPhi();
9860   for (VPRecipeBase &R :
9861        Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
9862     VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9863     if (!PhiR)
9864       continue;
9865 
9866     const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
9867     // If tail is folded by masking, introduce selects between the phi
9868     // and the users outside the vector region of each reduction, at the
9869     // beginning of the dedicated latch block.
9870     auto *OrigExitingVPV = PhiR->getBackedgeValue();
9871     auto *NewExitingVPV = PhiR->getBackedgeValue();
9872     if (!PhiR->isInLoop() && CM.foldTailByMasking()) {
9873       VPValue *Cond = RecipeBuilder.getBlockInMask(OrigLoop->getHeader());
9874       assert(OrigExitingVPV->getDefiningRecipe()->getParent() != LatchVPBB &&
9875              "reduction recipe must be defined before latch");
9876       Type *PhiTy = PhiR->getOperand(0)->getLiveInIRValue()->getType();
9877       std::optional<FastMathFlags> FMFs =
9878           PhiTy->isFloatingPointTy()
9879               ? std::make_optional(RdxDesc.getFastMathFlags())
9880               : std::nullopt;
9881       NewExitingVPV =
9882           Builder.createSelect(Cond, OrigExitingVPV, PhiR, {}, "", FMFs);
9883       OrigExitingVPV->replaceUsesWithIf(NewExitingVPV, [](VPUser &U, unsigned) {
9884         return isa<VPInstruction>(&U) &&
9885                cast<VPInstruction>(&U)->getOpcode() ==
9886                    VPInstruction::ComputeReductionResult;
9887       });
9888       if (CM.usePredicatedReductionSelect(
9889               PhiR->getRecurrenceDescriptor().getOpcode(), PhiTy))
9890         PhiR->setOperand(1, NewExitingVPV);
9891     }
9892 
9893     // If the vector reduction can be performed in a smaller type, we truncate
9894     // then extend the loop exit value to enable InstCombine to evaluate the
9895     // entire expression in the smaller type.
9896     Type *PhiTy = PhiR->getStartValue()->getLiveInIRValue()->getType();
9897     if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType() &&
9898         !RecurrenceDescriptor::isAnyOfRecurrenceKind(
9899             RdxDesc.getRecurrenceKind())) {
9900       assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
9901       Type *RdxTy = RdxDesc.getRecurrenceType();
9902       auto *Trunc =
9903           new VPWidenCastRecipe(Instruction::Trunc, NewExitingVPV, RdxTy);
9904       auto *Extnd =
9905           RdxDesc.isSigned()
9906               ? new VPWidenCastRecipe(Instruction::SExt, Trunc, PhiTy)
9907               : new VPWidenCastRecipe(Instruction::ZExt, Trunc, PhiTy);
9908 
9909       Trunc->insertAfter(NewExitingVPV->getDefiningRecipe());
9910       Extnd->insertAfter(Trunc);
9911       if (PhiR->getOperand(1) == NewExitingVPV)
9912         PhiR->setOperand(1, Extnd->getVPSingleValue());
9913       NewExitingVPV = Extnd;
9914     }
9915 
9916     // We want code in the middle block to appear to execute on the location of
9917     // the scalar loop's latch terminator because: (a) it is all compiler
9918     // generated, (b) these instructions are always executed after evaluating
9919     // the latch conditional branch, and (c) other passes may add new
9920     // predecessors which terminate on this line. This is the easiest way to
9921     // ensure we don't accidentally cause an extra step back into the loop while
9922     // debugging.
9923     DebugLoc ExitDL = OrigLoop->getLoopLatch()->getTerminator()->getDebugLoc();
9924 
9925     // TODO: At the moment ComputeReductionResult also drives creation of the
9926     // bc.merge.rdx phi nodes, hence it needs to be created unconditionally here
9927     // even for in-loop reductions, until the reduction resume value handling is
9928     // also modeled in VPlan.
9929     auto *FinalReductionResult = new VPInstruction(
9930         VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL);
9931     // Update all users outside the vector region.
9932     OrigExitingVPV->replaceUsesWithIf(
9933         FinalReductionResult, [](VPUser &User, unsigned) {
9934           auto *Parent = cast<VPRecipeBase>(&User)->getParent();
9935           return Parent && !Parent->getParent();
9936         });
9937     FinalReductionResult->insertBefore(*MiddleVPBB, IP);
9938 
9939     // Adjust AnyOf reductions; replace the reduction phi for the selected value
9940     // with a boolean reduction phi node to check if the condition is true in
9941     // any iteration. The final value is selected by the final
9942     // ComputeReductionResult.
9943     if (RecurrenceDescriptor::isAnyOfRecurrenceKind(
9944             RdxDesc.getRecurrenceKind())) {
9945       auto *Select = cast<VPRecipeBase>(*find_if(PhiR->users(), [](VPUser *U) {
9946         return isa<VPWidenSelectRecipe>(U) ||
9947                (isa<VPReplicateRecipe>(U) &&
9948                 cast<VPReplicateRecipe>(U)->getUnderlyingInstr()->getOpcode() ==
9949                     Instruction::Select);
9950       }));
9951       VPValue *Cmp = Select->getOperand(0);
9952       // If the compare is checking the reduction PHI node, adjust it to check
9953       // the start value.
9954       if (VPRecipeBase *CmpR = Cmp->getDefiningRecipe()) {
9955         for (unsigned I = 0; I != CmpR->getNumOperands(); ++I)
9956           if (CmpR->getOperand(I) == PhiR)
9957             CmpR->setOperand(I, PhiR->getStartValue());
9958       }
9959       VPBuilder::InsertPointGuard Guard(Builder);
9960       Builder.setInsertPoint(Select);
9961 
9962       // If the true value of the select is the reduction phi, the new value is
9963       // selected if the negated condition is true in any iteration.
9964       if (Select->getOperand(1) == PhiR)
9965         Cmp = Builder.createNot(Cmp);
9966       VPValue *Or = Builder.createOr(PhiR, Cmp);
9967       Select->getVPSingleValue()->replaceAllUsesWith(Or);
9968       // Delete Select now that it has invalid types.
9969       ToDelete.push_back(Select);
9970 
9971       // Convert the reduction phi to operate on bools.
9972       PhiR->setOperand(0, Plan->getOrAddLiveIn(ConstantInt::getFalse(
9973                               OrigLoop->getHeader()->getContext())));
9974       continue;
9975     }
9976 
9977     if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(
9978             RdxDesc.getRecurrenceKind())) {
9979       // Adjust the start value for FindLastIV recurrences to use the sentinel
9980       // value after generating the ResumePhi recipe, which uses the original
9981       // start value.
9982       PhiR->setOperand(0, Plan->getOrAddLiveIn(RdxDesc.getSentinelValue()));
9983     }
9984   }
9985 
9986   VPlanTransforms::clearReductionWrapFlags(*Plan);
9987   for (VPRecipeBase *R : ToDelete)
9988     R->eraseFromParent();
9989 }
9990 
9991 void VPDerivedIVRecipe::execute(VPTransformState &State) {
9992   assert(!State.Lane && "VPDerivedIVRecipe being replicated.");
9993 
9994   // Fast-math-flags propagate from the original induction instruction.
9995   IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
9996   if (FPBinOp)
9997     State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags());
9998 
9999   Value *Step = State.get(getStepValue(), VPLane(0));
10000   Value *Index = State.get(getOperand(1), VPLane(0));
10001   Value *DerivedIV = emitTransformedIndex(
10002       State.Builder, Index, getStartValue()->getLiveInIRValue(), Step, Kind,
10003       cast_if_present<BinaryOperator>(FPBinOp));
10004   DerivedIV->setName(Name);
10005   // If index is the vector trip count, the concrete value will only be set in
10006   // prepareToExecute, leading to missed simplifications, e.g. if it is 0.
10007   // TODO: Remove the special case for the vector trip count once it is computed
10008   // in VPlan and can be used during VPlan simplification.
10009   assert((DerivedIV != Index ||
10010           getOperand(1) == &getParent()->getPlan()->getVectorTripCount()) &&
10011          "IV didn't need transforming?");
10012   State.set(this, DerivedIV, VPLane(0));
10013 }
10014 
10015 void VPReplicateRecipe::execute(VPTransformState &State) {
10016   Instruction *UI = getUnderlyingInstr();
10017   if (State.Lane) { // Generate a single instance.
10018     assert((State.VF.isScalar() || !isUniform()) &&
10019            "uniform recipe shouldn't be predicated");
10020     assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
10021     State.ILV->scalarizeInstruction(UI, this, *State.Lane, State);
10022     // Insert scalar instance packing it into a vector.
10023     if (State.VF.isVector() && shouldPack()) {
10024       // If we're constructing lane 0, initialize to start from poison.
10025       if (State.Lane->isFirstLane()) {
10026         assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
10027         Value *Poison = PoisonValue::get(
10028             VectorType::get(UI->getType(), State.VF));
10029         State.set(this, Poison);
10030       }
10031       State.packScalarIntoVectorValue(this, *State.Lane);
10032     }
10033     return;
10034   }
10035 
10036   if (IsUniform) {
10037     // Uniform within VL means we need to generate lane 0.
10038     State.ILV->scalarizeInstruction(UI, this, VPLane(0), State);
10039     return;
10040   }
10041 
10042   // A store of a loop varying value to a uniform address only needs the last
10043   // copy of the store.
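  // For example (illustrative): for 'for (i) *q = a[i];' with q loop-invariant,
  // only the store of the last vector lane is generated.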
10044   if (isa<StoreInst>(UI) &&
10045       vputils::isUniformAfterVectorization(getOperand(1))) {
10046     auto Lane = VPLane::getLastLaneForVF(State.VF);
10047     State.ILV->scalarizeInstruction(UI, this, VPLane(Lane), State);
10048     return;
10049   }
10050 
10051   // Generate scalar instances for all VF lanes.
10052   assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
10053   const unsigned EndLane = State.VF.getKnownMinValue();
10054   for (unsigned Lane = 0; Lane < EndLane; ++Lane)
10055     State.ILV->scalarizeInstruction(UI, this, VPLane(Lane), State);
10056 }
10057 
10058 // Determine how to lower the scalar epilogue, which depends on 1) optimising
10059 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
10060 // predication, and 4) a TTI hook that analyses whether the loop is suitable
10061 // for predication.
10062 static ScalarEpilogueLowering getScalarEpilogueLowering(
10063     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
10064     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
10065     LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI) {
10066   // 1) OptSize takes precedence over all other options, i.e. if this is set,
10067   // don't look at hints or options, and don't request a scalar epilogue.
10068   // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
10069   // LoopAccessInfo (due to code dependency and not being able to reliably get
10070   // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
10071   // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
10072   // versioning when the vectorization is forced, unlike hasOptSize. So revert
10073   // back to the old way and vectorize with versioning when forced. See D81345.)
10074   if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
10075                                                       PGSOQueryType::IRPass) &&
10076                           Hints.getForce() != LoopVectorizeHints::FK_Enabled))
10077     return CM_ScalarEpilogueNotAllowedOptSize;
10078 
10079   // 2) If set, obey the directives
10080   if (PreferPredicateOverEpilogue.getNumOccurrences()) {
10081     switch (PreferPredicateOverEpilogue) {
10082     case PreferPredicateTy::ScalarEpilogue:
10083       return CM_ScalarEpilogueAllowed;
10084     case PreferPredicateTy::PredicateElseScalarEpilogue:
10085       return CM_ScalarEpilogueNotNeededUsePredicate;
10086     case PreferPredicateTy::PredicateOrDontVectorize:
10087       return CM_ScalarEpilogueNotAllowedUsePredicate;
10088     };
10089   }
10090 
10091   // 3) If set, obey the hints
10092   switch (Hints.getPredicate()) {
10093   case LoopVectorizeHints::FK_Enabled:
10094     return CM_ScalarEpilogueNotNeededUsePredicate;
10095   case LoopVectorizeHints::FK_Disabled:
10096     return CM_ScalarEpilogueAllowed;
10097   };
10098 
10099   // 4) If the TTI hook indicates this is profitable, request predication.
10100   TailFoldingInfo TFI(TLI, &LVL, IAI);
10101   if (TTI->preferPredicateOverEpilogue(&TFI))
10102     return CM_ScalarEpilogueNotNeededUsePredicate;
10103 
10104   return CM_ScalarEpilogueAllowed;
10105 }
10106 
10107 // Process the loop in the VPlan-native vectorization path. This path builds
10108 // VPlan upfront in the vectorization pipeline, which allows applying
10109 // VPlan-to-VPlan transformations from the very beginning without modifying the
10110 // input LLVM IR.
10111 static bool processLoopInVPlanNativePath(
10112     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
10113     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
10114     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
10115     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
10116     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
10117     LoopVectorizationRequirements &Requirements) {
10118 
10119   if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
10120     LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
10121     return false;
10122   }
10123   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
10124   Function *F = L->getHeader()->getParent();
10125   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
10126 
10127   ScalarEpilogueLowering SEL =
10128       getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, *LVL, &IAI);
10129 
10130   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
10131                                 &Hints, IAI);
10132   // Use the planner for outer loop vectorization.
10133   // TODO: CM is not used at this point inside the planner. Turn CM into an
10134   // optional argument if we don't need it in the future.
10135   LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, IAI, PSE, Hints,
10136                                ORE);
10137 
10138   // Get user vectorization factor.
10139   ElementCount UserVF = Hints.getWidth();
10140 
10141   CM.collectElementTypesForWidening();
10142 
10143   // Plan how to best vectorize, return the best VF and its cost.
10144   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
10145 
10146   // If we are stress testing VPlan builds, do not attempt to generate vector
10147   // code. Masked vector code generation support will follow soon.
10148   // Also, do not attempt to vectorize if no vector code will be produced.
10149   if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF)
10150     return false;
10151 
10152   VPlan &BestPlan = LVP.getPlanFor(VF.Width);
10153 
10154   {
10155     bool AddBranchWeights =
10156         hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
10157     GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(),
10158                              AddBranchWeights);
10159     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
10160                            VF.Width, 1, LVL, &CM, BFI, PSI, Checks, BestPlan);
10161     LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
10162                       << L->getHeader()->getParent()->getName() << "\"\n");
10163     LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false);
10164   }
10165 
10166   reportVectorization(ORE, L, VF, 1);
10167 
10168   // Mark the loop as already vectorized to avoid vectorizing again.
10169   Hints.setAlreadyVectorized();
10170   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10171   return true;
10172 }
10173 
10174 // Emit a remark if there are stores to floats that required a floating point
10175 // extension. If the vectorized loop was generated with double precision there
10176 // will be a performance penalty from the conversion overhead and the change in
10177 // the vector width.
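// For example (illustrative C source): 'A[i] = B[i] * 1.5;' with float arrays
// extends B[i] to double for the multiply and truncates the result back to
// float for the store; the fpext found on the path to the float store triggers
// the remark below.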
10178 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
10179   SmallVector<Instruction *, 4> Worklist;
10180   for (BasicBlock *BB : L->getBlocks()) {
10181     for (Instruction &Inst : *BB) {
10182       if (auto *S = dyn_cast<StoreInst>(&Inst)) {
10183         if (S->getValueOperand()->getType()->isFloatTy())
10184           Worklist.push_back(S);
10185       }
10186     }
10187   }
10188 
10189   // Traverse the floating point stores upwards, searching for floating point
10190   // conversions.
10191   SmallPtrSet<const Instruction *, 4> Visited;
10192   SmallPtrSet<const Instruction *, 4> EmittedRemark;
10193   while (!Worklist.empty()) {
10194     auto *I = Worklist.pop_back_val();
10195     if (!L->contains(I))
10196       continue;
10197     if (!Visited.insert(I).second)
10198       continue;
10199 
10200     // Emit a remark if the floating point store required a floating
10201     // point conversion.
10202     // TODO: More work could be done to identify the root cause such as a
10203     // constant or a function return type and point the user to it.
10204     if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
10205       ORE->emit([&]() {
10206         return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
10207                                           I->getDebugLoc(), L->getHeader())
10208                << "floating point conversion changes vector width. "
10209                << "Mixed floating point precision requires an up/down "
10210                << "cast that will negatively impact performance.";
10211       });
10212 
10213     for (Use &Op : I->operands())
10214       if (auto *OpI = dyn_cast<Instruction>(Op))
10215         Worklist.push_back(OpI);
10216   }
10217 }
10218 
10219 static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
10220                                        VectorizationFactor &VF, Loop *L,
10221                                        const TargetTransformInfo &TTI,
10222                                        PredicatedScalarEvolution &PSE,
10223                                        ScalarEpilogueLowering SEL) {
10224   InstructionCost CheckCost = Checks.getCost();
10225   if (!CheckCost.isValid())
10226     return false;
10227 
10228   // When only interleaving, the scalar and vector costs will be equal, which in
10229   // turn would lead to a division by 0. Fall back to the hard threshold.
10230   if (VF.Width.isScalar()) {
10231     if (CheckCost > VectorizeMemoryCheckThreshold) {
10232       LLVM_DEBUG(
10233           dbgs()
10234           << "LV: Interleaving only is not profitable due to runtime checks\n");
10235       return false;
10236     }
10237     return true;
10238   }
10239 
10240   // The scalar cost should only be 0 when vectorizing with a user specified
  // VF/IC. In those cases, runtime checks should always be generated.
10241   uint64_t ScalarC = *VF.ScalarCost.getValue();
10242   if (ScalarC == 0)
10243     return true;
10244 
10245   // First, compute the minimum iteration count required so that the vector
10246   // loop outperforms the scalar loop.
10247   //  The total cost of the scalar loop is
10248   //   ScalarC * TC
10249   //  where
10250   //  * TC is the actual trip count of the loop.
10251   //  * ScalarC is the cost of a single scalar iteration.
10252   //
10253   //  The total cost of the vector loop is
10254   //    RtC + VecC * (TC / VF) + EpiC
10255   //  where
10256   //  * RtC is the cost of the generated runtime checks
10257   //  * VecC is the cost of a single vector iteration.
10258   //  * TC is the actual trip count of the loop
10259   //  * VF is the vectorization factor
10260   //  * EpiCost is the cost of the generated epilogue, including the cost
10261   //  * EpiC is the cost of the generated epilogue, including the cost
10262   //
10263   // Vectorization is profitable once the total vector cost is less than the
10264   // total scalar cost:
10265   //   RtC + VecC * (TC / VF) + EpiC <  ScalarC * TC
10266   //
10267   // Now we can compute the minimum required trip count TC as
10268   //   VF * (RtC + EpiC) / (ScalarC * VF - VecC) < TC
10269   //
10270   // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
10271   // the computations are performed using integer arithmetic with the division
10272   // rounded up, hence we get an upper estimate of the TC.
10273   unsigned IntVF = getEstimatedRuntimeVF(L, TTI, VF.Width);
10274   uint64_t RtC = *CheckCost.getValue();
10275   uint64_t Div = ScalarC * IntVF - *VF.Cost.getValue();
10276   uint64_t MinTC1 = Div == 0 ? 0 : divideCeil(RtC * IntVF, Div);
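  // For example (illustrative numbers): with ScalarC = 4, VecC = 10, IntVF = 4
  // and RtC = 20, Div = 4 * 4 - 10 = 6 and MinTC1 = ceil(20 * 4 / 6) = 14.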
10277 
10278   // Second, compute a minimum iteration count so that the cost of the
10279   // runtime checks is only a fraction of the total scalar loop cost. This
10280   // adds a loop-dependent bound on the overhead incurred if the runtime
10281   // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC
10282   // * TC. To bound the runtime check to be a fraction 1/X of the scalar
10283   // cost, compute
10284   //   RtC < ScalarC * TC * (1 / X)  ==>  RtC * X / ScalarC < TC
10285   uint64_t MinTC2 = divideCeil(RtC * 10, ScalarC);
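  // For example (illustrative numbers): with RtC = 20 and ScalarC = 4, MinTC2 =
  // ceil(20 * 10 / 4) = 50, i.e. the checks stay within roughly 1/10 of the
  // scalar loop cost once the trip count reaches 50.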
10286 
10287   // Now pick the larger minimum. If it is not a multiple of VF and a scalar
10288   // epilogue is allowed, choose the next closest multiple of VF. This should
10289   // partly compensate for ignoring the epilogue cost.
10290   uint64_t MinTC = std::max(MinTC1, MinTC2);
10291   if (SEL == CM_ScalarEpilogueAllowed)
10292     MinTC = alignTo(MinTC, IntVF);
10293   VF.MinProfitableTripCount = ElementCount::getFixed(MinTC);
10294 
10295   LLVM_DEBUG(
10296       dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
10297              << VF.MinProfitableTripCount << "\n");
10298 
10299   // Skip vectorization if the expected trip count is less than the minimum
10300   // required trip count.
10301   if (auto ExpectedTC = getSmallBestKnownTC(PSE, L)) {
10302     if (ElementCount::isKnownLT(ElementCount::getFixed(*ExpectedTC),
10303                                 VF.MinProfitableTripCount)) {
10304       LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
10305                            "trip count < minimum profitable VF ("
10306                         << *ExpectedTC << " < " << VF.MinProfitableTripCount
10307                         << ")\n");
10308 
10309       return false;
10310     }
10311   }
10312   return true;
10313 }
10314 
10315 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
10316     : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
10317                                !EnableLoopInterleaving),
10318       VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
10319                               !EnableLoopVectorization) {}
10320 
10321 /// Prepare \p MainPlan for vectorizing the main vector loop during epilogue
10322 /// vectorization. Remove ResumePhis from \p MainPlan for inductions that
10323 /// don't have a corresponding wide induction in \p EpiPlan.
10324 static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) {
10325   // Collect PHI nodes of widened phis in the VPlan for the epilogue. Those
10326   // will need their resume-values computed in the main vector loop. Others
10327   // can be removed from the main VPlan.
10328   SmallPtrSet<PHINode *, 2> EpiWidenedPhis;
10329   for (VPRecipeBase &R :
10330        EpiPlan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
10331     if (isa<VPCanonicalIVPHIRecipe>(&R))
10332       continue;
10333     EpiWidenedPhis.insert(
10334         cast<PHINode>(R.getVPSingleValue()->getUnderlyingValue()));
10335   }
10336   for (VPRecipeBase &R : make_early_inc_range(
10337            *cast<VPIRBasicBlock>(MainPlan.getScalarHeader()))) {
10338     auto *VPIRInst = cast<VPIRInstruction>(&R);
10339     auto *IRI = dyn_cast<PHINode>(&VPIRInst->getInstruction());
10340     if (!IRI)
10341       break;
10342     if (EpiWidenedPhis.contains(IRI))
10343       continue;
10344     // There is no corresponding wide induction in the epilogue plan that would
10345     // need a resume value. Remove the VPIRInst wrapping the scalar header phi
10346     // together with the corresponding ResumePhi. The resume values for the
10347     // scalar loop will be created during execution of EpiPlan.
10348     VPRecipeBase *ResumePhi = VPIRInst->getOperand(0)->getDefiningRecipe();
10349     VPIRInst->eraseFromParent();
10350     ResumePhi->eraseFromParent();
10351   }
10352   VPlanTransforms::removeDeadRecipes(MainPlan);
10353 
10354   using namespace VPlanPatternMatch;
10355   VPBasicBlock *MainScalarPH = MainPlan.getScalarPreheader();
10356   VPValue *VectorTC = &MainPlan.getVectorTripCount();
10357   // If there is a suitable resume value for the canonical induction in the
10358   // scalar (which will become vector) epilogue loop we are done. Otherwise
10359   // create it below.
10360   if (any_of(*MainScalarPH, [VectorTC](VPRecipeBase &R) {
10361         return match(&R, m_VPInstruction<VPInstruction::ResumePhi>(
10362                              m_Specific(VectorTC), m_SpecificInt(0)));
10363       }))
10364     return;
10365   VPBuilder ScalarPHBuilder(MainScalarPH, MainScalarPH->begin());
10366   ScalarPHBuilder.createNaryOp(
10367       VPInstruction::ResumePhi,
10368       {VectorTC, MainPlan.getCanonicalIV()->getStartValue()}, {},
10369       "vec.epilog.resume.val");
10370 }
10371 
10372 /// Prepare \p Plan for vectorizing the epilogue loop. That is, re-use expanded
10373 /// SCEVs from \p ExpandedSCEVs and set resume values for header recipes.
10374 static void
10375 preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L,
10376                                  const SCEV2ValueTy &ExpandedSCEVs,
10377                                  const EpilogueLoopVectorizationInfo &EPI) {
10378   VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
10379   VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
10380   Header->setName("vec.epilog.vector.body");
10381 
10382   // Re-use the trip count and steps expanded for the main loop, as
10383   // skeleton creation needs them as values that dominate both the scalar
10384   // and vector epilogue loops.
10385   // TODO: This is a workaround needed for epilogue vectorization and it
10386   // should be removed once induction resume value creation is done
10387   // directly in VPlan.
10388   for (auto &R : make_early_inc_range(*Plan.getEntry())) {
10389     auto *ExpandR = dyn_cast<VPExpandSCEVRecipe>(&R);
10390     if (!ExpandR)
10391       continue;
10392     auto *ExpandedVal =
10393         Plan.getOrAddLiveIn(ExpandedSCEVs.find(ExpandR->getSCEV())->second);
10394     ExpandR->replaceAllUsesWith(ExpandedVal);
10395     if (Plan.getTripCount() == ExpandR)
10396       Plan.resetTripCount(ExpandedVal);
10397     ExpandR->eraseFromParent();
10398   }
10399 
10400   // Ensure that the start values for all header phi recipes are updated before
10401   // vectorizing the epilogue loop.
10402   for (VPRecipeBase &R : Header->phis()) {
10403     if (auto *IV = dyn_cast<VPCanonicalIVPHIRecipe>(&R)) {
10404       // When vectorizing the epilogue loop, the canonical induction start
10405       // value needs to be changed from zero to the value after the main
10406       // vector loop. Find the resume value created during execution of the main
10407       // VPlan.
10408       // FIXME: Improve modeling for canonical IV start values in the epilogue
10409       // loop.
10410       BasicBlock *MainMiddle = find_singleton<BasicBlock>(
10411           predecessors(L->getLoopPreheader()),
10412           [&EPI](BasicBlock *BB, bool) -> BasicBlock * {
10413             if (BB != EPI.MainLoopIterationCountCheck &&
10414                 BB != EPI.EpilogueIterationCountCheck &&
10415                 BB != EPI.SCEVSafetyCheck && BB != EPI.MemSafetyCheck)
10416               return BB;
10417             return nullptr;
10418           });
10419       using namespace llvm::PatternMatch;
10420       Type *IdxTy = IV->getScalarType();
10421       PHINode *EPResumeVal = find_singleton<PHINode>(
10422           L->getLoopPreheader()->phis(),
10423           [&EPI, IdxTy, MainMiddle](PHINode &P, bool) -> PHINode * {
10424             if (P.getType() == IdxTy &&
10425                 P.getIncomingValueForBlock(MainMiddle) == EPI.VectorTripCount &&
10426                 match(
10427                     P.getIncomingValueForBlock(EPI.MainLoopIterationCountCheck),
10428                     m_SpecificInt(0)))
10429               return &P;
10430             return nullptr;
10431           });
10432       assert(EPResumeVal && "must have a resume value for the canonical IV");
10433       VPValue *VPV = Plan.getOrAddLiveIn(EPResumeVal);
10434       assert(all_of(IV->users(),
10435                     [](const VPUser *U) {
10436                       return isa<VPScalarIVStepsRecipe>(U) ||
10437                              isa<VPScalarCastRecipe>(U) ||
10438                              isa<VPDerivedIVRecipe>(U) ||
10439                              cast<VPInstruction>(U)->getOpcode() ==
10440                                  Instruction::Add;
10441                     }) &&
10442              "the canonical IV should only be used by its increment or "
10443              "ScalarIVSteps when resetting the start value");
10444       IV->setOperand(0, VPV);
10445       continue;
10446     }
10447 
10448     Value *ResumeV = nullptr;
10449     // TODO: Move setting of resume values to prepareToExecute.
10450     if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
10451       ResumeV = cast<PHINode>(ReductionPhi->getUnderlyingInstr())
10452                     ->getIncomingValueForBlock(L->getLoopPreheader());
10453       const RecurrenceDescriptor &RdxDesc =
10454           ReductionPhi->getRecurrenceDescriptor();
10455       RecurKind RK = RdxDesc.getRecurrenceKind();
10456       if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
10457         // VPReductionPHIRecipes for AnyOf reductions expect a boolean as
10458         // start value; compare the final value from the main vector loop
10459         // to the start value.
10460         IRBuilder<> Builder(
10461             cast<Instruction>(ResumeV)->getParent()->getFirstNonPHI());
10462         ResumeV =
10463             Builder.CreateICmpNE(ResumeV, RdxDesc.getRecurrenceStartValue());
10464       } else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) {
10465         // VPReductionPHIRecipe for FindLastIV reductions requires an adjustment
10466         // to the resume value. The resume value is adjusted to the sentinel
10467         // value when the final value from the main vector loop equals the start
10468         // value. This ensures correctness when the start value might not be
10469         // less than the minimum value of a monotonically increasing induction
10470         // variable.
10471         IRBuilder<> Builder(
10472             cast<Instruction>(ResumeV)->getParent()->getFirstNonPHI());
10473         Value *Cmp =
10474             Builder.CreateICmpEQ(ResumeV, RdxDesc.getRecurrenceStartValue());
10475         ResumeV =
10476             Builder.CreateSelect(Cmp, RdxDesc.getSentinelValue(), ResumeV);
10477       }
10478     } else {
10479       // Retrieve the induction resume values for wide inductions from
10480       // their original phi nodes in the scalar loop.
10481       PHINode *IndPhi = cast<VPWidenInductionRecipe>(&R)->getPHINode();
10482       // Hook up to the PHINode generated by a ResumePhi recipe of main
10483       // loop VPlan, which feeds the scalar loop.
10484       ResumeV = IndPhi->getIncomingValueForBlock(L->getLoopPreheader());
10485     }
10486     assert(ResumeV && "Must have a resume value");
10487     VPValue *StartVal = Plan.getOrAddLiveIn(ResumeV);
10488     cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
10489   }
10490 }
10491 
10492 bool LoopVectorizePass::processLoop(Loop *L) {
10493   assert((EnableVPlanNativePath || L->isInnermost()) &&
10494          "VPlan-native path is not enabled. Only process inner loops.");
10495 
10496   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
10497                     << L->getHeader()->getParent()->getName() << "' from "
10498                     << L->getLocStr() << "\n");
10499 
10500   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
10501 
10502   LLVM_DEBUG(
10503       dbgs() << "LV: Loop hints:"
10504              << " force="
10505              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
10506                      ? "disabled"
10507                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
10508                             ? "enabled"
10509                             : "?"))
10510              << " width=" << Hints.getWidth()
10511              << " interleave=" << Hints.getInterleave() << "\n");
10512 
10513   // Function containing loop
10514   Function *F = L->getHeader()->getParent();
10515 
10516   // Looking at the diagnostic output is the only way to determine if a loop
10517   // was vectorized (other than looking at the IR or machine code), so it
10518   // is important to generate an optimization remark for each loop. Most of
10519   // these messages are generated as OptimizationRemarkAnalysis. Remarks
10520   // generated as OptimizationRemark and OptimizationRemarkMissed are
10521   // less verbose, reporting vectorized loops and unvectorized loops that may
10522   // benefit from vectorization, respectively.
10523 
10524   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
10525     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
10526     return false;
10527   }
10528 
10529   PredicatedScalarEvolution PSE(*SE, *L);
10530 
10531   // Check if it is legal to vectorize the loop.
10532   LoopVectorizationRequirements Requirements;
10533   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE,
10534                                 &Requirements, &Hints, DB, AC, BFI, PSI);
10535   if (!LVL.canVectorize(EnableVPlanNativePath)) {
10536     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
10537     Hints.emitRemarkWithHints();
10538     return false;
10539   }
10540 
10541   if (LVL.hasUncountableEarlyExit() && !EnableEarlyExitVectorization) {
10542     reportVectorizationFailure("Auto-vectorization of loops with uncountable "
10543                                "early exit is not enabled",
10544                                "UncountableEarlyExitLoopsDisabled", ORE, L);
10545     return false;
10546   }
10547 
10548   if (LVL.hasStructVectorCall()) {
10549     reportVectorizationFailure("Auto-vectorization of calls that return struct "
10550                                "types is not yet supported",
10551                                "StructCallVectorizationUnsupported", ORE, L);
10552     return false;
10553   }
10554 
10555   // Entrance to the VPlan-native vectorization path. Outer loops are processed
10556   // here. They may require CFG and instruction level transformations before
10557   // even evaluating whether vectorization is profitable. Since we cannot modify
10558   // the incoming IR, we need to build VPlan upfront in the vectorization
10559   // pipeline.
10560   if (!L->isInnermost())
10561     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
10562                                         ORE, BFI, PSI, Hints, Requirements);
10563 
10564   assert(L->isInnermost() && "Inner loop expected.");
10565 
10566   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
10567   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
10568 
10569   // If an override option has been passed in for interleaved accesses, use it.
10570   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
10571     UseInterleaved = EnableInterleavedMemAccesses;
10572 
10573   // Analyze interleaved memory accesses.
10574   if (UseInterleaved)
10575     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
10576 
10577   if (LVL.hasUncountableEarlyExit()) {
10578     BasicBlock *LoopLatch = L->getLoopLatch();
10579     if (IAI.requiresScalarEpilogue() ||
10580         any_of(LVL.getCountableExitingBlocks(),
10581                [LoopLatch](BasicBlock *BB) { return BB != LoopLatch; })) {
10582       reportVectorizationFailure("Auto-vectorization of early exit loops "
10583                                  "requiring a scalar epilogue is unsupported",
10584                                  "UncountableEarlyExitUnsupported", ORE, L);
10585       return false;
10586     }
10587   }
10588 
10589   // Check the function attributes and profiles to find out if this function
10590   // should be optimized for size.
10591   ScalarEpilogueLowering SEL =
10592       getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL, &IAI);
10593 
10594   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
10595   // count by optimizing for size, to minimize overheads.
10596   auto ExpectedTC = getSmallBestKnownTC(PSE, L);
10597   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
10598     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
10599                       << "This loop is worth vectorizing only if no scalar "
10600                       << "iteration overheads are incurred.");
10601     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
10602       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
10603     else {
10604       if (*ExpectedTC > TTI->getMinTripCountTailFoldingThreshold()) {
10605         LLVM_DEBUG(dbgs() << "\n");
10606         // Predicated tail-folded loops are efficient even when the loop
10607         // iteration count is low. However, setting the epilogue policy to
10608         // `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops
10609         // with runtime checks. It's more effective to let
10610         // `areRuntimeChecksProfitable` determine if vectorization is beneficial
10611         // for the loop.
10612         if (SEL != CM_ScalarEpilogueNotNeededUsePredicate)
10613           SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
10614       } else {
10615         LLVM_DEBUG(dbgs() << " But the target considers the trip count too "
10616                              "small to consider vectorizing.\n");
10617         reportVectorizationFailure(
10618             "The trip count is below the minimal threshold value.",
10619             "loop trip count is too low, avoiding vectorization",
10620             "LowTripCount", ORE, L);
10621         Hints.emitRemarkWithHints();
10622         return false;
10623       }
10624     }
10625   }
10626 
10627   // Check the function attributes to see if implicit floats or vectors are
10628   // allowed.
10629   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
10630     reportVectorizationFailure(
10631         "Can't vectorize when the NoImplicitFloat attribute is used",
10632         "loop not vectorized due to NoImplicitFloat attribute",
10633         "NoImplicitFloat", ORE, L);
10634     Hints.emitRemarkWithHints();
10635     return false;
10636   }
10637 
10638   // Check if the target supports potentially unsafe FP vectorization.
10639   // FIXME: Add a check for the type of safety issue (denormal, signaling)
10640   // for the target we're vectorizing for, to make sure none of the
10641   // additional fp-math flags can help.
10642   if (Hints.isPotentiallyUnsafe() &&
10643       TTI->isFPVectorizationPotentiallyUnsafe()) {
10644     reportVectorizationFailure(
10645         "Potentially unsafe FP op prevents vectorization",
10646         "loop not vectorized due to unsafe FP support.",
10647         "UnsafeFP", ORE, L);
10648     Hints.emitRemarkWithHints();
10649     return false;
10650   }
10651 
10652   bool AllowOrderedReductions;
10653   // If the flag is set, use that instead and override the TTI behaviour.
10654   if (ForceOrderedReductions.getNumOccurrences() > 0)
10655     AllowOrderedReductions = ForceOrderedReductions;
10656   else
10657     AllowOrderedReductions = TTI->enableOrderedReductions();
10658   if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) {
10659     ORE->emit([&]() {
10660       auto *ExactFPMathInst = Requirements.getExactFPInst();
10661       return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
10662                                                  ExactFPMathInst->getDebugLoc(),
10663                                                  ExactFPMathInst->getParent())
10664              << "loop not vectorized: cannot prove it is safe to reorder "
10665                 "floating-point operations";
10666     });
10667     LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
10668                          "reorder floating-point operations\n");
10669     Hints.emitRemarkWithHints();
10670     return false;
10671   }
10672 
10673   // Use the cost model.
10674   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
10675                                 F, &Hints, IAI);
10676   // Use the planner for vectorization.
10677   LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
10678                                ORE);
10679 
10680   // Get user vectorization factor and interleave count.
10681   ElementCount UserVF = Hints.getWidth();
10682   unsigned UserIC = Hints.getInterleave();
10683 
10684   // Plan how to best vectorize.
10685   LVP.plan(UserVF, UserIC);
10686   VectorizationFactor VF = LVP.computeBestVF();
10687   unsigned IC = 1;
10688 
10689   if (ORE->allowExtraAnalysis(LV_NAME))
10690     LVP.emitInvalidCostRemarks(ORE);
10691 
10692   bool AddBranchWeights =
10693       hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
10694   GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(),
10695                            AddBranchWeights);
10696   if (LVP.hasPlanWithVF(VF.Width)) {
10697     // Select the interleave count.
10698     IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
10699 
10700     unsigned SelectedIC = std::max(IC, UserIC);
10701     // Optimistically generate runtime checks if they are needed. Drop them if
10702     // they turn out not to be profitable.
10703     if (VF.Width.isVector() || SelectedIC > 1)
10704       Checks.create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
10705 
10706     // Check if it is profitable to vectorize with runtime checks.
10707     bool ForceVectorization =
10708         Hints.getForce() == LoopVectorizeHints::FK_Enabled;
10709     if (!ForceVectorization &&
10710         !areRuntimeChecksProfitable(Checks, VF, L, *TTI, PSE, SEL)) {
10711       ORE->emit([&]() {
10712         return OptimizationRemarkAnalysisAliasing(
10713                    DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
10714                    L->getHeader())
10715                << "loop not vectorized: cannot prove it is safe to reorder "
10716                   "memory operations";
10717       });
10718       LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
10719       Hints.emitRemarkWithHints();
10720       return false;
10721     }
10722   }
10723 
10724   // Identify the diagnostic messages that should be produced.
10725   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
10726   bool VectorizeLoop = true, InterleaveLoop = true;
10727   if (VF.Width.isScalar()) {
10728     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
10729     VecDiagMsg = std::make_pair(
10730         "VectorizationNotBeneficial",
10731         "the cost-model indicates that vectorization is not beneficial");
10732     VectorizeLoop = false;
10733   }
10734 
10735   if (!LVP.hasPlanWithVF(VF.Width) && UserIC > 1) {
10736     // Tell the user interleaving was avoided up-front, despite being explicitly
10737     // requested.
10738     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
10739                          "interleaving should be avoided up front\n");
10740     IntDiagMsg = std::make_pair(
10741         "InterleavingAvoided",
10742         "Ignoring UserIC, because interleaving was avoided up front");
10743     InterleaveLoop = false;
10744   } else if (IC == 1 && UserIC <= 1) {
10745     // Tell the user interleaving is not beneficial.
10746     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
10747     IntDiagMsg = std::make_pair(
10748         "InterleavingNotBeneficial",
10749         "the cost-model indicates that interleaving is not beneficial");
10750     InterleaveLoop = false;
10751     if (UserIC == 1) {
10752       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
10753       IntDiagMsg.second +=
10754           " and is explicitly disabled or interleave count is set to 1";
10755     }
10756   } else if (IC > 1 && UserIC == 1) {
10757     // Tell the user interleaving is beneficial but was explicitly disabled.
10758     LLVM_DEBUG(
10759         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.\n");
10760     IntDiagMsg = std::make_pair(
10761         "InterleavingBeneficialButDisabled",
10762         "the cost-model indicates that interleaving is beneficial "
10763         "but is explicitly disabled or interleave count is set to 1");
10764     InterleaveLoop = false;
10765   }
10766 
10767   // If there is a histogram in the loop, do not just interleave without
10768   // vectorizing. The order of operations will be incorrect without the
10769   // histogram intrinsics, which are only used for recipes with VF > 1.
10770   if (!VectorizeLoop && InterleaveLoop && LVL.hasHistograms()) {
10771     LLVM_DEBUG(dbgs() << "LV: Not interleaving without vectorization due "
10772                       << "to histogram operations.\n");
10773     IntDiagMsg = std::make_pair(
10774         "HistogramPreventsScalarInterleaving",
10775         "Unable to interleave without vectorization due to constraints on "
10776         "the order of histogram operations");
10777     InterleaveLoop = false;
10778   }
10779 
10780   // Override IC if user provided an interleave count.
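        // A UserIC of 0 means no explicit request; a UserIC of 1 disables
        // interleaving and has already been diagnosed above.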
10781   IC = UserIC > 0 ? UserIC : IC;
10782 
10783   // Emit diagnostic messages, if any.
10784   const char *VAPassName = Hints.vectorizeAnalysisPassName();
10785   if (!VectorizeLoop && !InterleaveLoop) {
10786     // Do not vectorize or interleave the loop.
10787     ORE->emit([&]() {
10788       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
10789                                       L->getStartLoc(), L->getHeader())
10790              << VecDiagMsg.second;
10791     });
10792     ORE->emit([&]() {
10793       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
10794                                       L->getStartLoc(), L->getHeader())
10795              << IntDiagMsg.second;
10796     });
10797     return false;
10798   }
10799 
10800   if (!VectorizeLoop && InterleaveLoop) {
10801     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10802     ORE->emit([&]() {
10803       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
10804                                         L->getStartLoc(), L->getHeader())
10805              << VecDiagMsg.second;
10806     });
10807   } else if (VectorizeLoop && !InterleaveLoop) {
10808     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10809                       << ") in " << L->getLocStr() << '\n');
10810     ORE->emit([&]() {
10811       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
10812                                         L->getStartLoc(), L->getHeader())
10813              << IntDiagMsg.second;
10814     });
10815   } else if (VectorizeLoop && InterleaveLoop) {
10816     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10817                       << ") in " << L->getLocStr() << '\n');
10818     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10819   }
10820 
10821   bool DisableRuntimeUnroll = false;
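        // Remember the original loop ID so follow-up metadata for the remainder
        // loop can be derived from it after the transformation.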
10822   MDNode *OrigLoopID = L->getLoopID();
10823   {
10824     using namespace ore;
10825     if (!VectorizeLoop) {
10826       assert(IC > 1 && "interleave count should not be 1 or 0");
10827       // If we decided that it is not profitable to vectorize the loop, then
10828       // interleave it.
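            // Interleaving reuses the vectorizer machinery with VF = 1 and the
            // chosen interleave count.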
10829       VPlan &BestPlan = LVP.getPlanFor(VF.Width);
10830       InnerLoopVectorizer Unroller(
10831           L, PSE, LI, DT, TLI, TTI, AC, ORE, ElementCount::getFixed(1),
10832           ElementCount::getFixed(1), IC, &LVL, &CM, BFI, PSI, Checks, BestPlan);
10833 
10834       LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);
10835 
10836       ORE->emit([&]() {
10837         return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
10838                                   L->getHeader())
10839                << "interleaved loop (interleaved count: "
10840                << NV("InterleaveCount", IC) << ")";
10841       });
10842     } else {
10843       // If we decided that it is *profitable* to vectorize the loop, then do it.
10844 
10845       VPlan &BestPlan = LVP.getPlanFor(VF.Width);
10846       // Consider vectorizing the epilogue too if it's profitable.
10847       VectorizationFactor EpilogueVF =
10848           LVP.selectEpilogueVectorizationFactor(VF.Width, IC);
10849       if (EpilogueVF.Width.isVector()) {
10850         std::unique_ptr<VPlan> BestMainPlan(BestPlan.duplicate());
10851 
10852         // The first pass vectorizes the main loop and creates a scalar epilogue
10853         // to be vectorized by executing the plan (potentially with a different
10854         // factor) again shortly afterwards.
10855         VPlan &BestEpiPlan = LVP.getPlanFor(EpilogueVF.Width);
10856         preparePlanForMainVectorLoop(*BestMainPlan, BestEpiPlan);
10857         EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1,
10858                                           BestEpiPlan);
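              // EPI carries the VF/UF of both passes; its "main loop" fields are
              // re-pointed at the epilogue values before the second pass below.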
10859         EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
10860                                            EPI, &LVL, &CM, BFI, PSI, Checks,
10861                                            *BestMainPlan);
10862         auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF,
10863                                              *BestMainPlan, MainILV, DT, false);
10864         ++LoopsVectorized;
10865 
10866         // Second pass vectorizes the epilogue and adjusts the control flow
10867         // edges from the first pass.
10868         EPI.MainLoopVF = EPI.EpilogueVF;
10869         EPI.MainLoopUF = EPI.EpilogueUF;
10870         EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
10871                                                  ORE, EPI, &LVL, &CM, BFI, PSI,
10872                                                  Checks, BestEpiPlan);
10873         EpilogILV.setTripCount(MainILV.getTripCount());
10874         preparePlanForEpilogueVectorLoop(BestEpiPlan, L, ExpandedSCEVs, EPI);
10875 
10876         LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
10877                         DT, true, &ExpandedSCEVs);
10878         ++LoopsEpilogueVectorized;
10879 
10880         if (!MainILV.areSafetyChecksAdded())
10881           DisableRuntimeUnroll = true;
10882       } else {
10883         InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
10884                                VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
10885                                PSI, Checks, BestPlan);
10886         LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
10887         ++LoopsVectorized;
10888 
10889         // Add metadata to disable runtime unrolling of the scalar loop when there
10890         // are no runtime checks about strides and memory. A scalar loop that is
10891         // rarely used is not worth unrolling.
10892         if (!LB.areSafetyChecksAdded())
10893           DisableRuntimeUnroll = true;
10894       }
10895       // Report the vectorization decision.
10896       reportVectorization(ORE, L, VF, IC);
10897     }
10898 
10899     if (ORE->allowExtraAnalysis(LV_NAME))
10900       checkMixedPrecision(L, ORE);
10901   }
10902 
10903   assert(DT->verify(DominatorTree::VerificationLevel::Fast) &&
10904          "DT not preserved correctly");
10905 
10906   std::optional<MDNode *> RemainderLoopID =
10907       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
10908                                       LLVMLoopVectorizeFollowupEpilogue});
10909   if (RemainderLoopID) {
10910     L->setLoopID(*RemainderLoopID);
10911   } else {
10912     if (DisableRuntimeUnroll)
10913       addRuntimeUnrollDisableMetaData(L);
10914 
10915     // Mark the loop as already vectorized to avoid vectorizing again.
10916     Hints.setAlreadyVectorized();
10917   }
10918 
10919   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10920   return true;
10921 }
10922 
10923 LoopVectorizeResult LoopVectorizePass::runImpl(Function &F) {
10924 
10925   // Don't attempt if
10926   // 1. the target claims to have no vector registers, and
10927   // 2. interleaving won't help ILP.
10928   //
10929   // The second condition is necessary because, even if the target has no
10930   // vector registers, loop vectorization may still enable scalar
10931   // interleaving.
10932   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
10933       TTI->getMaxInterleaveFactor(ElementCount::getFixed(1)) < 2)
10934     return LoopVectorizeResult(false, false);
10935 
10936   bool Changed = false, CFGChanged = false;
10937 
10938   // The vectorizer requires loops to be in simplified form.
10939   // Since simplification may add new inner loops, it has to run before the
10940   // legality and profitability checks. This means running the loop vectorizer
10941   // will simplify all loops, regardless of whether anything ends up being
10942   // vectorized.
10943   for (const auto &L : *LI)
10944     Changed |= CFGChanged |=
10945         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
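        // LCSSA is not preserved here; it is formed per-loop below, just before
        // each selected inner loop is processed.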
10946 
10947   // Build up a worklist of inner-loops to vectorize. This is necessary as
10948   // the act of vectorizing or partially unrolling a loop creates new loops
10949   // and can invalidate iterators across the loops.
10950   SmallVector<Loop *, 8> Worklist;
10951 
10952   for (Loop *L : *LI)
10953     collectSupportedLoops(*L, LI, ORE, Worklist);
10954 
10955   LoopsAnalyzed += Worklist.size();
10956 
10957   // Now walk the identified inner loops.
10958   while (!Worklist.empty()) {
10959     Loop *L = Worklist.pop_back_val();
10960 
10961     // For the inner loops we actually process, form LCSSA to simplify the
10962     // transform.
10963     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
10964 
10965     Changed |= CFGChanged |= processLoop(L);
10966 
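          // Any cached loop-access info may be stale after a transformation, so
          // drop it (and, in debug builds, optionally re-verify SCEV).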
10967     if (Changed) {
10968       LAIs->clear();
10969 
10970 #ifndef NDEBUG
10971       if (VerifySCEV)
10972         SE->verify();
10973 #endif
10974     }
10975   }
10976 
10977   // Report whether any loop was changed and whether the CFG was modified.
10978   return LoopVectorizeResult(Changed, CFGChanged);
10979 }
10980 
10981 PreservedAnalyses LoopVectorizePass::run(Function &F,
10982                                          FunctionAnalysisManager &AM) {
10983   LI = &AM.getResult<LoopAnalysis>(F);
10984   // If there are no loops in the function, return before computing other
10985   // expensive analyses.
10986   if (LI->empty())
10987     return PreservedAnalyses::all();
10988   SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
10989   TTI = &AM.getResult<TargetIRAnalysis>(F);
10990   DT = &AM.getResult<DominatorTreeAnalysis>(F);
10991   TLI = &AM.getResult<TargetLibraryAnalysis>(F);
10992   AC = &AM.getResult<AssumptionAnalysis>(F);
10993   DB = &AM.getResult<DemandedBitsAnalysis>(F);
10994   ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
10995   LAIs = &AM.getResult<LoopAccessAnalysis>(F);
10996 
10997   auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
10998   PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
10999   BFI = nullptr;
11000   if (PSI && PSI->hasProfileSummary())
11001     BFI = &AM.getResult<BlockFrequencyAnalysis>(F);
11002   LoopVectorizeResult Result = runImpl(F);
11003   if (!Result.MadeAnyChange)
11004     return PreservedAnalyses::all();
11005   PreservedAnalyses PA;
11006 
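        // With assignment tracking enabled, the transformation may leave behind
        // redundant debug records; clean them up before returning.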
11007   if (isAssignmentTrackingEnabled(*F.getParent())) {
11008     for (auto &BB : F)
11009       RemoveRedundantDbgInstrs(&BB);
11010   }
11011 
11012   PA.preserve<LoopAnalysis>();
11013   PA.preserve<DominatorTreeAnalysis>();
11014   PA.preserve<ScalarEvolutionAnalysis>();
11015   PA.preserve<LoopAccessAnalysis>();
11016 
11017   if (Result.MadeCFGChange) {
11018     // Making CFG changes likely means a loop got vectorized. Indicate that
11019     // extra simplification passes should be run.
11020     // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
11021     // be run if runtime checks have been added.
11022     AM.getResult<ShouldRunExtraVectorPasses>(F);
11023     PA.preserve<ShouldRunExtraVectorPasses>();
11024   } else {
11025     PA.preserveSet<CFGAnalyses>();
11026   }
11027   return PA;
11028 }
11029 
11030 void LoopVectorizePass::printPipeline(
11031     raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
11032   static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
11033       OS, MapClassName2PassName);
11034 
11035   OS << '<';
11036   OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
11037   OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
11038   OS << '>';
11039 }
11040