xref: /freebsd-src/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (revision cb14a3fe5122c879eae1fb480ed7ce82a699ddb6)
1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
17 //
18 // This pass has three parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
28 // There is a development effort going on to migrate loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanAnalysis.h"
61 #include "VPlanHCFGBuilder.h"
62 #include "VPlanTransforms.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/STLExtras.h"
70 #include "llvm/ADT/SmallPtrSet.h"
71 #include "llvm/ADT/SmallSet.h"
72 #include "llvm/ADT/SmallVector.h"
73 #include "llvm/ADT/Statistic.h"
74 #include "llvm/ADT/StringRef.h"
75 #include "llvm/ADT/Twine.h"
76 #include "llvm/ADT/iterator_range.h"
77 #include "llvm/Analysis/AssumptionCache.h"
78 #include "llvm/Analysis/BasicAliasAnalysis.h"
79 #include "llvm/Analysis/BlockFrequencyInfo.h"
80 #include "llvm/Analysis/CFG.h"
81 #include "llvm/Analysis/CodeMetrics.h"
82 #include "llvm/Analysis/DemandedBits.h"
83 #include "llvm/Analysis/GlobalsModRef.h"
84 #include "llvm/Analysis/LoopAccessAnalysis.h"
85 #include "llvm/Analysis/LoopAnalysisManager.h"
86 #include "llvm/Analysis/LoopInfo.h"
87 #include "llvm/Analysis/LoopIterator.h"
88 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
89 #include "llvm/Analysis/ProfileSummaryInfo.h"
90 #include "llvm/Analysis/ScalarEvolution.h"
91 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
92 #include "llvm/Analysis/TargetLibraryInfo.h"
93 #include "llvm/Analysis/TargetTransformInfo.h"
94 #include "llvm/Analysis/ValueTracking.h"
95 #include "llvm/Analysis/VectorUtils.h"
96 #include "llvm/IR/Attributes.h"
97 #include "llvm/IR/BasicBlock.h"
98 #include "llvm/IR/CFG.h"
99 #include "llvm/IR/Constant.h"
100 #include "llvm/IR/Constants.h"
101 #include "llvm/IR/DataLayout.h"
102 #include "llvm/IR/DebugInfo.h"
103 #include "llvm/IR/DebugInfoMetadata.h"
104 #include "llvm/IR/DebugLoc.h"
105 #include "llvm/IR/DerivedTypes.h"
106 #include "llvm/IR/DiagnosticInfo.h"
107 #include "llvm/IR/Dominators.h"
108 #include "llvm/IR/Function.h"
109 #include "llvm/IR/IRBuilder.h"
110 #include "llvm/IR/InstrTypes.h"
111 #include "llvm/IR/Instruction.h"
112 #include "llvm/IR/Instructions.h"
113 #include "llvm/IR/IntrinsicInst.h"
114 #include "llvm/IR/Intrinsics.h"
115 #include "llvm/IR/MDBuilder.h"
116 #include "llvm/IR/Metadata.h"
117 #include "llvm/IR/Module.h"
118 #include "llvm/IR/Operator.h"
119 #include "llvm/IR/PatternMatch.h"
120 #include "llvm/IR/ProfDataUtils.h"
121 #include "llvm/IR/Type.h"
122 #include "llvm/IR/Use.h"
123 #include "llvm/IR/User.h"
124 #include "llvm/IR/Value.h"
125 #include "llvm/IR/ValueHandle.h"
126 #include "llvm/IR/Verifier.h"
127 #include "llvm/Support/Casting.h"
128 #include "llvm/Support/CommandLine.h"
129 #include "llvm/Support/Compiler.h"
130 #include "llvm/Support/Debug.h"
131 #include "llvm/Support/ErrorHandling.h"
132 #include "llvm/Support/InstructionCost.h"
133 #include "llvm/Support/MathExtras.h"
134 #include "llvm/Support/raw_ostream.h"
135 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
136 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
137 #include "llvm/Transforms/Utils/LoopSimplify.h"
138 #include "llvm/Transforms/Utils/LoopUtils.h"
139 #include "llvm/Transforms/Utils/LoopVersioning.h"
140 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
141 #include "llvm/Transforms/Utils/SizeOpts.h"
142 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
143 #include <algorithm>
144 #include <cassert>
145 #include <cmath>
146 #include <cstdint>
147 #include <functional>
148 #include <iterator>
149 #include <limits>
150 #include <map>
151 #include <memory>
152 #include <string>
153 #include <tuple>
154 #include <utility>
155 
156 using namespace llvm;
157 
158 #define LV_NAME "loop-vectorize"
159 #define DEBUG_TYPE LV_NAME
160 
161 #ifndef NDEBUG
162 const char VerboseDebug[] = DEBUG_TYPE "-verbose";
163 #endif
164 
165 /// @{
166 /// Metadata attribute names
167 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
168 const char LLVMLoopVectorizeFollowupVectorized[] =
169     "llvm.loop.vectorize.followup_vectorized";
170 const char LLVMLoopVectorizeFollowupEpilogue[] =
171     "llvm.loop.vectorize.followup_epilogue";
172 /// @}
173 
174 STATISTIC(LoopsVectorized, "Number of loops vectorized");
175 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
176 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
177 
178 static cl::opt<bool> EnableEpilogueVectorization(
179     "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
180     cl::desc("Enable vectorization of epilogue loops."));
181 
182 static cl::opt<unsigned> EpilogueVectorizationForceVF(
183     "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
184     cl::desc("When epilogue vectorization is enabled, and a value greater than "
185              "1 is specified, forces the given VF for all applicable epilogue "
186              "loops."));
187 
188 static cl::opt<unsigned> EpilogueVectorizationMinVF(
189     "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
190     cl::desc("Only loops with vectorization factor equal to or larger than "
191              "the specified value are considered for epilogue vectorization."));
192 
193 /// Loops with a known constant trip count below this number are vectorized only
194 /// if no scalar iteration overheads are incurred.
195 static cl::opt<unsigned> TinyTripCountVectorThreshold(
196     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
197     cl::desc("Loops with a constant trip count that is smaller than this "
198              "value are vectorized only if no scalar iteration overheads "
199              "are incurred."));
200 
201 static cl::opt<unsigned> VectorizeMemoryCheckThreshold(
202     "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
203     cl::desc("The maximum allowed number of runtime memory checks"));
204 
205 // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
206 // that predication is preferred, and this lists all options. I.e., the
207 // vectorizer will try to fold the tail-loop (epilogue) into the vector body
208 // and predicate the instructions accordingly. If tail-folding fails, there are
209 // different fallback strategies depending on these values:
210 namespace PreferPredicateTy {
211   enum Option {
212     ScalarEpilogue = 0,
213     PredicateElseScalarEpilogue,
214     PredicateOrDontVectorize
215   };
216 } // namespace PreferPredicateTy
217 
218 static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
219     "prefer-predicate-over-epilogue",
220     cl::init(PreferPredicateTy::ScalarEpilogue),
221     cl::Hidden,
222     cl::desc("Tail-folding and predication preferences over creating a scalar "
223              "epilogue loop."),
224     cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
225                          "scalar-epilogue",
226                          "Don't tail-predicate loops, create scalar epilogue"),
227               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
228                          "predicate-else-scalar-epilogue",
229                          "prefer tail-folding, create scalar epilogue if tail "
230                          "folding fails."),
231               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
232                          "predicate-dont-vectorize",
233                          "prefers tail-folding, don't attempt vectorization if "
234                          "tail-folding fails.")));
235 
236 static cl::opt<TailFoldingStyle> ForceTailFoldingStyle(
237     "force-tail-folding-style", cl::desc("Force the tail folding style"),
238     cl::init(TailFoldingStyle::None),
239     cl::values(
240         clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"),
241         clEnumValN(
242             TailFoldingStyle::Data, "data",
243             "Create lane mask for data only, using active.lane.mask intrinsic"),
244         clEnumValN(TailFoldingStyle::DataWithoutLaneMask,
245                    "data-without-lane-mask",
246                    "Create lane mask with compare/stepvector"),
247         clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control",
248                    "Create lane mask using active.lane.mask intrinsic, and use "
249                    "it for both data and control flow"),
250         clEnumValN(
251             TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
252             "data-and-control-without-rt-check",
253             "Similar to data-and-control, but remove the runtime check")));
254 
255 static cl::opt<bool> MaximizeBandwidth(
256     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
257     cl::desc("Maximize bandwidth when selecting vectorization factor which "
258              "will be determined by the smallest type in loop."));
259 
260 static cl::opt<bool> EnableInterleavedMemAccesses(
261     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
262     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
263 
264 /// An interleave-group may need masking if it resides in a block that needs
265 /// predication, or in order to mask away gaps.
266 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
267     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
268     cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
269 
270 static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
271     "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
272     cl::desc("We don't interleave loops with a estimated constant trip count "
273              "below this number"));
274 
275 static cl::opt<unsigned> ForceTargetNumScalarRegs(
276     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
277     cl::desc("A flag that overrides the target's number of scalar registers."));
278 
279 static cl::opt<unsigned> ForceTargetNumVectorRegs(
280     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
281     cl::desc("A flag that overrides the target's number of vector registers."));
282 
283 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
284     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
285     cl::desc("A flag that overrides the target's max interleave factor for "
286              "scalar loops."));
287 
288 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
289     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
290     cl::desc("A flag that overrides the target's max interleave factor for "
291              "vectorized loops."));
292 
293 static cl::opt<unsigned> ForceTargetInstructionCost(
294     "force-target-instruction-cost", cl::init(0), cl::Hidden,
295     cl::desc("A flag that overrides the target's expected cost for "
296              "an instruction to a single constant value. Mostly "
297              "useful for getting consistent testing."));
298 
299 static cl::opt<bool> ForceTargetSupportsScalableVectors(
300     "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
301     cl::desc(
302         "Pretend that scalable vectors are supported, even if the target does "
303         "not support them. This flag should only be used for testing."));
304 
305 static cl::opt<unsigned> SmallLoopCost(
306     "small-loop-cost", cl::init(20), cl::Hidden,
307     cl::desc(
308         "The cost of a loop that is considered 'small' by the interleaver."));
309 
310 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
311     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
312     cl::desc("Enable the use of the block frequency analysis to access PGO "
313              "heuristics minimizing code growth in cold regions and being more "
314              "aggressive in hot regions."));
315 
316 // Runtime interleave loops for load/store throughput.
317 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
318     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
319     cl::desc(
320         "Enable runtime interleaving until load/store ports are saturated"));
321 
322 /// Interleave small loops with scalar reductions.
323 static cl::opt<bool> InterleaveSmallLoopScalarReduction(
324     "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
325     cl::desc("Enable interleaving for loops with small iteration counts that "
326              "contain scalar reductions to expose ILP."));
327 
328 /// The number of stores in a loop that are allowed to need predication.
329 static cl::opt<unsigned> NumberOfStoresToPredicate(
330     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
331     cl::desc("Max number of stores to be predicated behind an if."));
332 
333 static cl::opt<bool> EnableIndVarRegisterHeur(
334     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
335     cl::desc("Count the induction variable only once when interleaving"));
336 
337 static cl::opt<bool> EnableCondStoresVectorization(
338     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
339     cl::desc("Enable if predication of stores during vectorization."));
340 
341 static cl::opt<unsigned> MaxNestedScalarReductionIC(
342     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
343     cl::desc("The maximum interleave count to use when interleaving a scalar "
344              "reduction in a nested loop."));
345 
346 static cl::opt<bool>
347     PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
348                            cl::Hidden,
349                            cl::desc("Prefer in-loop vector reductions, "
350                                     "overriding the targets preference."));
351 
352 static cl::opt<bool> ForceOrderedReductions(
353     "force-ordered-reductions", cl::init(false), cl::Hidden,
354     cl::desc("Enable the vectorisation of loops with in-order (strict) "
355              "FP reductions"));
356 
357 static cl::opt<bool> PreferPredicatedReductionSelect(
358     "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
359     cl::desc(
360         "Prefer predicating a reduction operation over an after loop select."));
361 
362 namespace llvm {
363 cl::opt<bool> EnableVPlanNativePath(
364     "enable-vplan-native-path", cl::Hidden,
365     cl::desc("Enable VPlan-native vectorization path with "
366              "support for outer loop vectorization."));
367 }
368 
369 // This flag enables the stress testing of the VPlan H-CFG construction in the
370 // VPlan-native vectorization path. It must be used in conjuction with
371 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
372 // verification of the H-CFGs built.
373 static cl::opt<bool> VPlanBuildStressTest(
374     "vplan-build-stress-test", cl::init(false), cl::Hidden,
375     cl::desc(
376         "Build VPlan for every supported loop nest in the function and bail "
377         "out right after the build (stress test the VPlan H-CFG construction "
378         "in the VPlan-native vectorization path)."));
379 
380 cl::opt<bool> llvm::EnableLoopInterleaving(
381     "interleave-loops", cl::init(true), cl::Hidden,
382     cl::desc("Enable loop interleaving in Loop vectorization passes"));
383 cl::opt<bool> llvm::EnableLoopVectorization(
384     "vectorize-loops", cl::init(true), cl::Hidden,
385     cl::desc("Run the Loop vectorization passes"));
386 
387 static cl::opt<bool> PrintVPlansInDotFormat(
388     "vplan-print-in-dot-format", cl::Hidden,
389     cl::desc("Use dot format instead of plain text when dumping VPlans"));
390 
391 static cl::opt<cl::boolOrDefault> ForceSafeDivisor(
392     "force-widen-divrem-via-safe-divisor", cl::Hidden,
393     cl::desc(
394         "Override cost based safe divisor widening for div/rem instructions"));
395 
396 static cl::opt<bool> UseWiderVFIfCallVariantsPresent(
397     "vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true),
398     cl::Hidden,
399     cl::desc("Try wider VFs if they enable the use of vector variants"));
400 
401 // Likelyhood of bypassing the vectorized loop because assumptions about SCEV
402 // variables not overflowing do not hold. See `emitSCEVChecks`.
403 static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127};
404 // Likelyhood of bypassing the vectorized loop because pointers overlap. See
405 // `emitMemRuntimeChecks`.
406 static constexpr uint32_t MemCheckBypassWeights[] = {1, 127};
407 // Likelyhood of bypassing the vectorized loop because there are zero trips left
408 // after prolog. See `emitIterationCountCheck`.
409 static constexpr uint32_t MinItersBypassWeights[] = {1, 127};
410 
411 /// A helper function that returns true if the given type is irregular. The
412 /// type is irregular if its allocated size doesn't equal the store size of an
413 /// element of the corresponding vector type.
414 static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
415   // Determine if an array of N elements of type Ty is "bitcast compatible"
416   // with a <N x Ty> vector.
417   // This is only true if there is no padding between the array elements.
418   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
419 }
420 
421 /// A helper function that returns the reciprocal of the block probability of
422 /// predicated blocks. If we return X, we are assuming the predicated block
423 /// will execute once for every X iterations of the loop header.
424 ///
425 /// TODO: We should use actual block probability here, if available. Currently,
426 ///       we always assume predicated blocks have a 50% chance of executing.
427 static unsigned getReciprocalPredBlockProb() { return 2; }
428 
429 /// Returns "best known" trip count for the specified loop \p L as defined by
430 /// the following procedure:
431 ///   1) Returns exact trip count if it is known.
432 ///   2) Returns expected trip count according to profile data if any.
433 ///   3) Returns upper bound estimate if it is known.
434 ///   4) Returns std::nullopt if all of the above failed.
435 static std::optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE,
436                                                    Loop *L) {
437   // Check if exact trip count is known.
438   if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
439     return ExpectedTC;
440 
441   // Check if there is an expected trip count available from profile data.
442   if (LoopVectorizeWithBlockFrequency)
443     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
444       return *EstimatedTC;
445 
446   // Check if upper bound estimate is known.
447   if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
448     return ExpectedTC;
449 
450   return std::nullopt;
451 }
452 
453 /// Return a vector containing interleaved elements from multiple
454 /// smaller input vectors.
455 static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
456                                 const Twine &Name) {
457   unsigned Factor = Vals.size();
458   assert(Factor > 1 && "Tried to interleave invalid number of vectors");
459 
460   VectorType *VecTy = cast<VectorType>(Vals[0]->getType());
461 #ifndef NDEBUG
462   for (Value *Val : Vals)
463     assert(Val->getType() == VecTy && "Tried to interleave mismatched types");
464 #endif
465 
466   // Scalable vectors cannot use arbitrary shufflevectors (only splats), so
467   // must use intrinsics to interleave.
468   if (VecTy->isScalableTy()) {
469     VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy);
470     return Builder.CreateIntrinsic(
471         WideVecTy, Intrinsic::experimental_vector_interleave2, Vals,
472         /*FMFSource=*/nullptr, Name);
473   }
474 
475   // Fixed length. Start by concatenating all vectors into a wide vector.
476   Value *WideVec = concatenateVectors(Builder, Vals);
477 
478   // Interleave the elements into the wide vector.
479   const unsigned NumElts = VecTy->getElementCount().getFixedValue();
480   return Builder.CreateShuffleVector(
481       WideVec, createInterleaveMask(NumElts, Factor), Name);
482 }
483 
484 namespace {
485 // Forward declare GeneratedRTChecks.
486 class GeneratedRTChecks;
487 
488 using SCEV2ValueTy = DenseMap<const SCEV *, Value *>;
489 } // namespace
490 
491 namespace llvm {
492 
493 AnalysisKey ShouldRunExtraVectorPasses::Key;
494 
495 /// InnerLoopVectorizer vectorizes loops which contain only one basic
496 /// block to a specified vectorization factor (VF).
497 /// This class performs the widening of scalars into vectors, or multiple
498 /// scalars. This class also implements the following features:
499 /// * It inserts an epilogue loop for handling loops that don't have iteration
500 ///   counts that are known to be a multiple of the vectorization factor.
501 /// * It handles the code generation for reduction variables.
502 /// * Scalarization (implementation using scalars) of un-vectorizable
503 ///   instructions.
504 /// InnerLoopVectorizer does not perform any vectorization-legality
505 /// checks, and relies on the caller to check for the different legality
506 /// aspects. The InnerLoopVectorizer relies on the
507 /// LoopVectorizationLegality class to provide information about the induction
508 /// and reduction variables that were found to a given vectorization factor.
509 class InnerLoopVectorizer {
510 public:
511   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
512                       LoopInfo *LI, DominatorTree *DT,
513                       const TargetLibraryInfo *TLI,
514                       const TargetTransformInfo *TTI, AssumptionCache *AC,
515                       OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
516                       ElementCount MinProfitableTripCount,
517                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
518                       LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
519                       ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
520       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
521         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
522         Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
523         PSI(PSI), RTChecks(RTChecks) {
524     // Query this against the original loop and save it here because the profile
525     // of the original loop header may change as the transformation happens.
526     OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
527         OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
528 
529     if (MinProfitableTripCount.isZero())
530       this->MinProfitableTripCount = VecWidth;
531     else
532       this->MinProfitableTripCount = MinProfitableTripCount;
533   }
534 
535   virtual ~InnerLoopVectorizer() = default;
536 
537   /// Create a new empty loop that will contain vectorized instructions later
538   /// on, while the old loop will be used as the scalar remainder. Control flow
539   /// is generated around the vectorized (and scalar epilogue) loops consisting
540   /// of various checks and bypasses. Return the pre-header block of the new
541   /// loop and the start value for the canonical induction, if it is != 0. The
542   /// latter is the case when vectorizing the epilogue loop. In the case of
543   /// epilogue vectorization, this function is overriden to handle the more
544   /// complex control flow around the loops.  \p ExpandedSCEVs is used to
545   /// look up SCEV expansions for expressions needed during skeleton creation.
546   virtual std::pair<BasicBlock *, Value *>
547   createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs);
548 
549   /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
550   void fixVectorizedLoop(VPTransformState &State, VPlan &Plan);
551 
552   // Return true if any runtime check is added.
553   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
554 
555   /// A type for vectorized values in the new loop. Each value from the
556   /// original loop, when vectorized, is represented by UF vector values in the
557   /// new unrolled loop, where UF is the unroll factor.
558   using VectorParts = SmallVector<Value *, 2>;
559 
560   /// A helper function to scalarize a single Instruction in the innermost loop.
561   /// Generates a sequence of scalar instances for each lane between \p MinLane
562   /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
563   /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p
564   /// Instr's operands.
565   void scalarizeInstruction(const Instruction *Instr,
566                             VPReplicateRecipe *RepRecipe,
567                             const VPIteration &Instance,
568                             VPTransformState &State);
569 
570   /// Try to vectorize interleaved access group \p Group with the base address
571   /// given in \p Addr, optionally masking the vector operations if \p
572   /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
573   /// values in the vectorized loop.
574   void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
575                                 ArrayRef<VPValue *> VPDefs,
576                                 VPTransformState &State, VPValue *Addr,
577                                 ArrayRef<VPValue *> StoredValues,
578                                 VPValue *BlockInMask, bool NeedsMaskForGaps);
579 
580   /// Fix the non-induction PHIs in \p Plan.
581   void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State);
582 
583   /// Returns true if the reordering of FP operations is not allowed, but we are
584   /// able to vectorize with strict in-order reductions for the given RdxDesc.
585   bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc);
586 
587   // Returns the resume value (bc.merge.rdx) for a reduction as
588   // generated by fixReduction.
589   PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc);
590 
591   /// Create a new phi node for the induction variable \p OrigPhi to resume
592   /// iteration count in the scalar epilogue, from where the vectorized loop
593   /// left off. \p Step is the SCEV-expanded induction step to use. In cases
594   /// where the loop skeleton is more complicated (i.e., epilogue vectorization)
595   /// and the resume values can come from an additional bypass block, the \p
596   /// AdditionalBypass pair provides information about the bypass block and the
597   /// end value on the edge from bypass to this loop.
598   PHINode *createInductionResumeValue(
599       PHINode *OrigPhi, const InductionDescriptor &ID, Value *Step,
600       ArrayRef<BasicBlock *> BypassBlocks,
601       std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
602 
603   /// Returns the original loop trip count.
604   Value *getTripCount() const { return TripCount; }
605 
606   /// Used to set the trip count after ILV's construction and after the
607   /// preheader block has been executed. Note that this always holds the trip
608   /// count of the original loop for both main loop and epilogue vectorization.
609   void setTripCount(Value *TC) { TripCount = TC; }
610 
611 protected:
612   friend class LoopVectorizationPlanner;
613 
614   /// A small list of PHINodes.
615   using PhiVector = SmallVector<PHINode *, 4>;
616 
617   /// A type for scalarized values in the new loop. Each value from the
618   /// original loop, when scalarized, is represented by UF x VF scalar values
619   /// in the new unrolled loop, where UF is the unroll factor and VF is the
620   /// vectorization factor.
621   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
622 
623   /// Set up the values of the IVs correctly when exiting the vector loop.
624   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
625                     Value *VectorTripCount, Value *EndValue,
626                     BasicBlock *MiddleBlock, BasicBlock *VectorHeader,
627                     VPlan &Plan, VPTransformState &State);
628 
629   /// Handle all cross-iteration phis in the header.
630   void fixCrossIterationPHIs(VPTransformState &State);
631 
632   /// Create the exit value of first order recurrences in the middle block and
633   /// update their users.
634   void fixFixedOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR,
635                                VPTransformState &State);
636 
637   /// Create code for the loop exit value of the reduction.
638   void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);
639 
640   /// Iteratively sink the scalarized operands of a predicated instruction into
641   /// the block that was created for it.
642   void sinkScalarOperands(Instruction *PredInst);
643 
644   /// Returns (and creates if needed) the trip count of the widened loop.
645   Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);
646 
647   /// Returns a bitcasted value to the requested vector type.
648   /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
649   Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
650                                 const DataLayout &DL);
651 
652   /// Emit a bypass check to see if the vector trip count is zero, including if
653   /// it overflows.
654   void emitIterationCountCheck(BasicBlock *Bypass);
655 
656   /// Emit a bypass check to see if all of the SCEV assumptions we've
657   /// had to make are correct. Returns the block containing the checks or
658   /// nullptr if no checks have been added.
659   BasicBlock *emitSCEVChecks(BasicBlock *Bypass);
660 
661   /// Emit bypass checks to check any memory assumptions we may have made.
662   /// Returns the block containing the checks or nullptr if no checks have been
663   /// added.
664   BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass);
665 
666   /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
667   /// vector loop preheader, middle block and scalar preheader.
668   void createVectorLoopSkeleton(StringRef Prefix);
669 
670   /// Create new phi nodes for the induction variables to resume iteration count
671   /// in the scalar epilogue, from where the vectorized loop left off.
672   /// In cases where the loop skeleton is more complicated (eg. epilogue
673   /// vectorization) and the resume values can come from an additional bypass
674   /// block, the \p AdditionalBypass pair provides information about the bypass
675   /// block and the end value on the edge from bypass to this loop.
676   void createInductionResumeValues(
677       const SCEV2ValueTy &ExpandedSCEVs,
678       std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
679 
680   /// Complete the loop skeleton by adding debug MDs, creating appropriate
681   /// conditional branches in the middle block, preparing the builder and
682   /// running the verifier. Return the preheader of the completed vector loop.
683   BasicBlock *completeLoopSkeleton();
684 
685   /// Collect poison-generating recipes that may generate a poison value that is
686   /// used after vectorization, even when their operands are not poison. Those
687   /// recipes meet the following conditions:
688   ///  * Contribute to the address computation of a recipe generating a widen
689   ///    memory load/store (VPWidenMemoryInstructionRecipe or
690   ///    VPInterleaveRecipe).
691   ///  * Such a widen memory load/store has at least one underlying Instruction
692   ///    that is in a basic block that needs predication and after vectorization
693   ///    the generated instruction won't be predicated.
694   void collectPoisonGeneratingRecipes(VPTransformState &State);
695 
696   /// Allow subclasses to override and print debug traces before/after vplan
697   /// execution, when trace information is requested.
698   virtual void printDebugTracesAtStart(){};
699   virtual void printDebugTracesAtEnd(){};
700 
701   /// The original loop.
702   Loop *OrigLoop;
703 
704   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
705   /// dynamic knowledge to simplify SCEV expressions and converts them to a
706   /// more usable form.
707   PredicatedScalarEvolution &PSE;
708 
709   /// Loop Info.
710   LoopInfo *LI;
711 
712   /// Dominator Tree.
713   DominatorTree *DT;
714 
715   /// Target Library Info.
716   const TargetLibraryInfo *TLI;
717 
718   /// Target Transform Info.
719   const TargetTransformInfo *TTI;
720 
721   /// Assumption Cache.
722   AssumptionCache *AC;
723 
724   /// Interface to emit optimization remarks.
725   OptimizationRemarkEmitter *ORE;
726 
727   /// The vectorization SIMD factor to use. Each vector will have this many
728   /// vector elements.
729   ElementCount VF;
730 
731   ElementCount MinProfitableTripCount;
732 
733   /// The vectorization unroll factor to use. Each scalar is vectorized to this
734   /// many different vector instructions.
735   unsigned UF;
736 
737   /// The builder that we use
738   IRBuilder<> Builder;
739 
740   // --- Vectorization state ---
741 
742   /// The vector-loop preheader.
743   BasicBlock *LoopVectorPreHeader;
744 
745   /// The scalar-loop preheader.
746   BasicBlock *LoopScalarPreHeader;
747 
748   /// Middle Block between the vector and the scalar.
749   BasicBlock *LoopMiddleBlock;
750 
751   /// The unique ExitBlock of the scalar loop if one exists.  Note that
752   /// there can be multiple exiting edges reaching this block.
753   BasicBlock *LoopExitBlock;
754 
755   /// The scalar loop body.
756   BasicBlock *LoopScalarBody;
757 
758   /// A list of all bypass blocks. The first block is the entry of the loop.
759   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
760 
761   /// Store instructions that were predicated.
762   SmallVector<Instruction *, 4> PredicatedInstructions;
763 
764   /// Trip count of the original loop.
765   Value *TripCount = nullptr;
766 
767   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
768   Value *VectorTripCount = nullptr;
769 
770   /// The legality analysis.
771   LoopVectorizationLegality *Legal;
772 
773   /// The profitablity analysis.
774   LoopVectorizationCostModel *Cost;
775 
776   // Record whether runtime checks are added.
777   bool AddedSafetyChecks = false;
778 
779   // Holds the end values for each induction variable. We save the end values
780   // so we can later fix-up the external users of the induction variables.
781   DenseMap<PHINode *, Value *> IVEndValues;
782 
783   /// BFI and PSI are used to check for profile guided size optimizations.
784   BlockFrequencyInfo *BFI;
785   ProfileSummaryInfo *PSI;
786 
787   // Whether this loop should be optimized for size based on profile guided size
788   // optimizatios.
789   bool OptForSizeBasedOnProfile;
790 
791   /// Structure to hold information about generated runtime checks, responsible
792   /// for cleaning the checks, if vectorization turns out unprofitable.
793   GeneratedRTChecks &RTChecks;
794 
795   // Holds the resume values for reductions in the loops, used to set the
796   // correct start value of reduction PHIs when vectorizing the epilogue.
797   SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4>
798       ReductionResumeValues;
799 };
800 
801 class InnerLoopUnroller : public InnerLoopVectorizer {
802 public:
803   InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
804                     LoopInfo *LI, DominatorTree *DT,
805                     const TargetLibraryInfo *TLI,
806                     const TargetTransformInfo *TTI, AssumptionCache *AC,
807                     OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
808                     LoopVectorizationLegality *LVL,
809                     LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
810                     ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
811       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
812                             ElementCount::getFixed(1),
813                             ElementCount::getFixed(1), UnrollFactor, LVL, CM,
814                             BFI, PSI, Check) {}
815 };
816 
817 /// Encapsulate information regarding vectorization of a loop and its epilogue.
818 /// This information is meant to be updated and used across two stages of
819 /// epilogue vectorization.
820 struct EpilogueLoopVectorizationInfo {
821   ElementCount MainLoopVF = ElementCount::getFixed(0);
822   unsigned MainLoopUF = 0;
823   ElementCount EpilogueVF = ElementCount::getFixed(0);
824   unsigned EpilogueUF = 0;
825   BasicBlock *MainLoopIterationCountCheck = nullptr;
826   BasicBlock *EpilogueIterationCountCheck = nullptr;
827   BasicBlock *SCEVSafetyCheck = nullptr;
828   BasicBlock *MemSafetyCheck = nullptr;
829   Value *TripCount = nullptr;
830   Value *VectorTripCount = nullptr;
831 
832   EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
833                                 ElementCount EVF, unsigned EUF)
834       : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
835     assert(EUF == 1 &&
836            "A high UF for the epilogue loop is likely not beneficial.");
837   }
838 };
839 
840 /// An extension of the inner loop vectorizer that creates a skeleton for a
841 /// vectorized loop that has its epilogue (residual) also vectorized.
842 /// The idea is to run the vplan on a given loop twice, firstly to setup the
843 /// skeleton and vectorize the main loop, and secondly to complete the skeleton
844 /// from the first step and vectorize the epilogue.  This is achieved by
845 /// deriving two concrete strategy classes from this base class and invoking
846 /// them in succession from the loop vectorizer planner.
847 class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
848 public:
849   InnerLoopAndEpilogueVectorizer(
850       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
851       DominatorTree *DT, const TargetLibraryInfo *TLI,
852       const TargetTransformInfo *TTI, AssumptionCache *AC,
853       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
854       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
855       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
856       GeneratedRTChecks &Checks)
857       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
858                             EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
859                             CM, BFI, PSI, Checks),
860         EPI(EPI) {}
861 
862   // Override this function to handle the more complex control flow around the
863   // three loops.
864   std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton(
865       const SCEV2ValueTy &ExpandedSCEVs) final {
866     return createEpilogueVectorizedLoopSkeleton(ExpandedSCEVs);
867   }
868 
869   /// The interface for creating a vectorized skeleton using one of two
870   /// different strategies, each corresponding to one execution of the vplan
871   /// as described above.
872   virtual std::pair<BasicBlock *, Value *>
873   createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) = 0;
874 
875   /// Holds and updates state information required to vectorize the main loop
876   /// and its epilogue in two separate passes. This setup helps us avoid
877   /// regenerating and recomputing runtime safety checks. It also helps us to
878   /// shorten the iteration-count-check path length for the cases where the
879   /// iteration count of the loop is so small that the main vector loop is
880   /// completely skipped.
881   EpilogueLoopVectorizationInfo &EPI;
882 };
883 
884 /// A specialized derived class of inner loop vectorizer that performs
885 /// vectorization of *main* loops in the process of vectorizing loops and their
886 /// epilogues.
887 class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
888 public:
889   EpilogueVectorizerMainLoop(
890       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
891       DominatorTree *DT, const TargetLibraryInfo *TLI,
892       const TargetTransformInfo *TTI, AssumptionCache *AC,
893       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
894       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
895       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
896       GeneratedRTChecks &Check)
897       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
898                                        EPI, LVL, CM, BFI, PSI, Check) {}
899   /// Implements the interface for creating a vectorized skeleton using the
900   /// *main loop* strategy (ie the first pass of vplan execution).
901   std::pair<BasicBlock *, Value *>
902   createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
903 
904 protected:
905   /// Emits an iteration count bypass check once for the main loop (when \p
906   /// ForEpilogue is false) and once for the epilogue loop (when \p
907   /// ForEpilogue is true).
908   BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
909   void printDebugTracesAtStart() override;
910   void printDebugTracesAtEnd() override;
911 };
912 
913 // A specialized derived class of inner loop vectorizer that performs
914 // vectorization of *epilogue* loops in the process of vectorizing loops and
915 // their epilogues.
916 class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
917 public:
918   EpilogueVectorizerEpilogueLoop(
919       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
920       DominatorTree *DT, const TargetLibraryInfo *TLI,
921       const TargetTransformInfo *TTI, AssumptionCache *AC,
922       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
923       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
924       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
925       GeneratedRTChecks &Checks)
926       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
927                                        EPI, LVL, CM, BFI, PSI, Checks) {
928     TripCount = EPI.TripCount;
929   }
930   /// Implements the interface for creating a vectorized skeleton using the
931   /// *epilogue loop* strategy (ie the second pass of vplan execution).
932   std::pair<BasicBlock *, Value *>
933   createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
934 
935 protected:
936   /// Emits an iteration count bypass check after the main vector loop has
937   /// finished to see if there are any iterations left to execute by either
938   /// the vector epilogue or the scalar epilogue.
939   BasicBlock *emitMinimumVectorEpilogueIterCountCheck(
940                                                       BasicBlock *Bypass,
941                                                       BasicBlock *Insert);
942   void printDebugTracesAtStart() override;
943   void printDebugTracesAtEnd() override;
944 };
945 } // end namespace llvm
946 
947 /// Look for a meaningful debug location on the instruction or it's
948 /// operands.
949 static DebugLoc getDebugLocFromInstOrOperands(Instruction *I) {
950   if (!I)
951     return DebugLoc();
952 
953   DebugLoc Empty;
954   if (I->getDebugLoc() != Empty)
955     return I->getDebugLoc();
956 
957   for (Use &Op : I->operands()) {
958     if (Instruction *OpInst = dyn_cast<Instruction>(Op))
959       if (OpInst->getDebugLoc() != Empty)
960         return OpInst->getDebugLoc();
961   }
962 
963   return I->getDebugLoc();
964 }
965 
966 /// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
967 /// is passed, the message relates to that particular instruction.
968 #ifndef NDEBUG
969 static void debugVectorizationMessage(const StringRef Prefix,
970                                       const StringRef DebugMsg,
971                                       Instruction *I) {
972   dbgs() << "LV: " << Prefix << DebugMsg;
973   if (I != nullptr)
974     dbgs() << " " << *I;
975   else
976     dbgs() << '.';
977   dbgs() << '\n';
978 }
979 #endif
980 
981 /// Create an analysis remark that explains why vectorization failed
982 ///
983 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
984 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
985 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
986 /// the location of the remark.  \return the remark object that can be
987 /// streamed to.
988 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
989     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
990   Value *CodeRegion = TheLoop->getHeader();
991   DebugLoc DL = TheLoop->getStartLoc();
992 
993   if (I) {
994     CodeRegion = I->getParent();
995     // If there is no debug location attached to the instruction, revert back to
996     // using the loop's.
997     if (I->getDebugLoc())
998       DL = I->getDebugLoc();
999   }
1000 
1001   return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
1002 }
1003 
1004 namespace llvm {
1005 
1006 /// Return a value for Step multiplied by VF.
1007 Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
1008                        int64_t Step) {
1009   assert(Ty->isIntegerTy() && "Expected an integer step");
1010   return B.CreateElementCount(Ty, VF.multiplyCoefficientBy(Step));
1011 }
1012 
1013 /// Return the runtime value for VF.
1014 Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
1015   return B.CreateElementCount(Ty, VF);
1016 }
1017 
1018 const SCEV *createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE,
1019                                 Loop *OrigLoop) {
1020   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
1021   assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && "Invalid loop count");
1022 
1023   ScalarEvolution &SE = *PSE.getSE();
1024   return SE.getTripCountFromExitCount(BackedgeTakenCount, IdxTy, OrigLoop);
1025 }
1026 
1027 void reportVectorizationFailure(const StringRef DebugMsg,
1028                                 const StringRef OREMsg, const StringRef ORETag,
1029                                 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1030                                 Instruction *I) {
1031   LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
1032   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1033   ORE->emit(
1034       createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1035       << "loop not vectorized: " << OREMsg);
1036 }
1037 
1038 void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
1039                              OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1040                              Instruction *I) {
1041   LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
1042   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1043   ORE->emit(
1044       createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1045       << Msg);
1046 }
1047 
1048 /// Report successful vectorization of the loop. In case an outer loop is
1049 /// vectorized, prepend "outer" to the vectorization remark.
1050 static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1051                                 VectorizationFactor VF, unsigned IC) {
1052   LLVM_DEBUG(debugVectorizationMessage(
1053       "Vectorizing: ", TheLoop->isInnermost() ? "innermost loop" : "outer loop",
1054       nullptr));
1055   StringRef LoopType = TheLoop->isInnermost() ? "" : "outer ";
1056   ORE->emit([&]() {
1057     return OptimizationRemark(LV_NAME, "Vectorized", TheLoop->getStartLoc(),
1058                               TheLoop->getHeader())
1059            << "vectorized " << LoopType << "loop (vectorization width: "
1060            << ore::NV("VectorizationFactor", VF.Width)
1061            << ", interleaved count: " << ore::NV("InterleaveCount", IC) << ")";
1062   });
1063 }
1064 
1065 } // end namespace llvm
1066 
1067 #ifndef NDEBUG
1068 /// \return string containing a file name and a line # for the given loop.
1069 static std::string getDebugLocString(const Loop *L) {
1070   std::string Result;
1071   if (L) {
1072     raw_string_ostream OS(Result);
1073     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
1074       LoopDbgLoc.print(OS);
1075     else
1076       // Just print the module name.
1077       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
1078     OS.flush();
1079   }
1080   return Result;
1081 }
1082 #endif
1083 
1084 void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
1085     VPTransformState &State) {
1086 
1087   // Collect recipes in the backward slice of `Root` that may generate a poison
1088   // value that is used after vectorization.
1089   SmallPtrSet<VPRecipeBase *, 16> Visited;
1090   auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
1091     SmallVector<VPRecipeBase *, 16> Worklist;
1092     Worklist.push_back(Root);
1093 
1094     // Traverse the backward slice of Root through its use-def chain.
1095     while (!Worklist.empty()) {
1096       VPRecipeBase *CurRec = Worklist.back();
1097       Worklist.pop_back();
1098 
1099       if (!Visited.insert(CurRec).second)
1100         continue;
1101 
1102       // Prune search if we find another recipe generating a widen memory
1103       // instruction. Widen memory instructions involved in address computation
1104       // will lead to gather/scatter instructions, which don't need to be
1105       // handled.
1106       if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
1107           isa<VPInterleaveRecipe>(CurRec) ||
1108           isa<VPScalarIVStepsRecipe>(CurRec) ||
1109           isa<VPCanonicalIVPHIRecipe>(CurRec) ||
1110           isa<VPActiveLaneMaskPHIRecipe>(CurRec))
1111         continue;
1112 
1113       // This recipe contributes to the address computation of a widen
1114       // load/store. If the underlying instruction has poison-generating flags,
1115       // drop them directly.
1116       if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {
1117         RecWithFlags->dropPoisonGeneratingFlags();
1118       } else {
1119         Instruction *Instr = dyn_cast_or_null<Instruction>(
1120             CurRec->getVPSingleValue()->getUnderlyingValue());
1121         (void)Instr;
1122         assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
1123                "found instruction with poison generating flags not covered by "
1124                "VPRecipeWithIRFlags");
1125       }
1126 
1127       // Add new definitions to the worklist.
1128       for (VPValue *operand : CurRec->operands())
1129         if (VPRecipeBase *OpDef = operand->getDefiningRecipe())
1130           Worklist.push_back(OpDef);
1131     }
1132   });
1133 
1134   // Traverse all the recipes in the VPlan and collect the poison-generating
1135   // recipes in the backward slice starting at the address of a VPWidenRecipe or
1136   // VPInterleaveRecipe.
1137   auto Iter = vp_depth_first_deep(State.Plan->getEntry());
1138   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
1139     for (VPRecipeBase &Recipe : *VPBB) {
1140       if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
1141         Instruction &UnderlyingInstr = WidenRec->getIngredient();
1142         VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
1143         if (AddrDef && WidenRec->isConsecutive() &&
1144             Legal->blockNeedsPredication(UnderlyingInstr.getParent()))
1145           collectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
1146       } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
1147         VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
1148         if (AddrDef) {
1149           // Check if any member of the interleave group needs predication.
1150           const InterleaveGroup<Instruction> *InterGroup =
1151               InterleaveRec->getInterleaveGroup();
1152           bool NeedPredication = false;
1153           for (int I = 0, NumMembers = InterGroup->getNumMembers();
1154                I < NumMembers; ++I) {
1155             Instruction *Member = InterGroup->getMember(I);
1156             if (Member)
1157               NeedPredication |=
1158                   Legal->blockNeedsPredication(Member->getParent());
1159           }
1160 
1161           if (NeedPredication)
1162             collectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
1163         }
1164       }
1165     }
1166   }
1167 }
1168 
1169 PHINode *InnerLoopVectorizer::getReductionResumeValue(
1170     const RecurrenceDescriptor &RdxDesc) {
1171   auto It = ReductionResumeValues.find(&RdxDesc);
1172   assert(It != ReductionResumeValues.end() &&
1173          "Expected to find a resume value for the reduction.");
1174   return It->second;
1175 }
1176 
1177 namespace llvm {
1178 
1179 // Loop vectorization cost-model hints how the scalar epilogue loop should be
1180 // lowered.
1181 enum ScalarEpilogueLowering {
1182 
1183   // The default: allowing scalar epilogues.
1184   CM_ScalarEpilogueAllowed,
1185 
1186   // Vectorization with OptForSize: don't allow epilogues.
1187   CM_ScalarEpilogueNotAllowedOptSize,
1188 
1189   // A special case of vectorisation with OptForSize: loops with a very small
1190   // trip count are considered for vectorization under OptForSize, thereby
1191   // making sure the cost of their loop body is dominant, free of runtime
1192   // guards and scalar iteration overheads.
1193   CM_ScalarEpilogueNotAllowedLowTripLoop,
1194 
1195   // Loop hint predicate indicating an epilogue is undesired.
1196   CM_ScalarEpilogueNotNeededUsePredicate,
1197 
1198   // Directive indicating we must either tail fold or not vectorize
1199   CM_ScalarEpilogueNotAllowedUsePredicate
1200 };
1201 
1202 using InstructionVFPair = std::pair<Instruction *, ElementCount>;
1203 
1204 /// LoopVectorizationCostModel - estimates the expected speedups due to
1205 /// vectorization.
1206 /// In many cases vectorization is not profitable. This can happen because of
1207 /// a number of reasons. In this class we mainly attempt to predict the
1208 /// expected speedup/slowdowns due to the supported instruction set. We use the
1209 /// TargetTransformInfo to query the different backends for the cost of
1210 /// different operations.
1211 class LoopVectorizationCostModel {
1212 public:
1213   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
1214                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
1215                              LoopVectorizationLegality *Legal,
1216                              const TargetTransformInfo &TTI,
1217                              const TargetLibraryInfo *TLI, DemandedBits *DB,
1218                              AssumptionCache *AC,
1219                              OptimizationRemarkEmitter *ORE, const Function *F,
1220                              const LoopVectorizeHints *Hints,
1221                              InterleavedAccessInfo &IAI)
1222       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1223         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1224         Hints(Hints), InterleaveInfo(IAI) {}
1225 
1226   /// \return An upper bound for the vectorization factors (both fixed and
1227   /// scalable). If the factors are 0, vectorization and interleaving should be
1228   /// avoided up front.
1229   FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
1230 
1231   /// \return True if runtime checks are required for vectorization, and false
1232   /// otherwise.
1233   bool runtimeChecksRequired();
1234 
1235   /// Setup cost-based decisions for user vectorization factor.
1236   /// \return true if the UserVF is a feasible VF to be chosen.
1237   bool selectUserVectorizationFactor(ElementCount UserVF) {
1238     collectUniformsAndScalars(UserVF);
1239     collectInstsToScalarize(UserVF);
1240     return expectedCost(UserVF).first.isValid();
1241   }
1242 
1243   /// \return The size (in bits) of the smallest and widest types in the code
1244   /// that needs to be vectorized. We ignore values that remain scalar such as
1245   /// 64 bit loop indices.
1246   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1247 
1248   /// \return The desired interleave count.
1249   /// If interleave count has been specified by metadata it will be returned.
1250   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1251   /// are the selected vectorization factor and the cost of the selected VF.
1252   unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost);
1253 
1254   /// Memory access instruction may be vectorized in more than one way.
1255   /// Form of instruction after vectorization depends on cost.
1256   /// This function takes cost-based decisions for Load/Store instructions
1257   /// and collects them in a map. This decisions map is used for building
1258   /// the lists of loop-uniform and loop-scalar instructions.
1259   /// The calculated cost is saved with widening decision in order to
1260   /// avoid redundant calculations.
1261   void setCostBasedWideningDecision(ElementCount VF);
1262 
1263   /// A call may be vectorized in different ways depending on whether we have
1264   /// vectorized variants available and whether the target supports masking.
1265   /// This function analyzes all calls in the function at the supplied VF,
1266   /// makes a decision based on the costs of available options, and stores that
1267   /// decision in a map for use in planning and plan execution.
1268   void setVectorizedCallDecision(ElementCount VF);
1269 
1270   /// A struct that represents some properties of the register usage
1271   /// of a loop.
1272   struct RegisterUsage {
1273     /// Holds the number of loop invariant values that are used in the loop.
1274     /// The key is ClassID of target-provided register class.
1275     SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1276     /// Holds the maximum number of concurrent live intervals in the loop.
1277     /// The key is ClassID of target-provided register class.
1278     SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1279   };
1280 
1281   /// \return Returns information about the register usages of the loop for the
1282   /// given vectorization factors.
1283   SmallVector<RegisterUsage, 8>
1284   calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1285 
1286   /// Collect values we want to ignore in the cost model.
1287   void collectValuesToIgnore();
1288 
1289   /// Collect all element types in the loop for which widening is needed.
1290   void collectElementTypesForWidening();
1291 
1292   /// Split reductions into those that happen in the loop, and those that happen
1293   /// outside. In loop reductions are collected into InLoopReductions.
1294   void collectInLoopReductions();
1295 
1296   /// Returns true if we should use strict in-order reductions for the given
1297   /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
1298   /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
1299   /// of FP operations.
1300   bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
1301     return !Hints->allowReordering() && RdxDesc.isOrdered();
1302   }
1303 
1304   /// \returns The smallest bitwidth each instruction can be represented with.
1305   /// The vector equivalents of these instructions should be truncated to this
1306   /// type.
1307   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1308     return MinBWs;
1309   }
1310 
1311   /// \returns True if it is more profitable to scalarize instruction \p I for
1312   /// vectorization factor \p VF.
1313   bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1314     assert(VF.isVector() &&
1315            "Profitable to scalarize relevant only for VF > 1.");
1316 
1317     // Cost model is not run in the VPlan-native path - return conservative
1318     // result until this changes.
1319     if (EnableVPlanNativePath)
1320       return false;
1321 
1322     auto Scalars = InstsToScalarize.find(VF);
1323     assert(Scalars != InstsToScalarize.end() &&
1324            "VF not yet analyzed for scalarization profitability");
1325     return Scalars->second.contains(I);
1326   }
1327 
1328   /// Returns true if \p I is known to be uniform after vectorization.
1329   bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1330     // Pseudo probe needs to be duplicated for each unrolled iteration and
1331     // vector lane so that profiled loop trip count can be accurately
1332     // accumulated instead of being under counted.
1333     if (isa<PseudoProbeInst>(I))
1334       return false;
1335 
1336     if (VF.isScalar())
1337       return true;
1338 
1339     // Cost model is not run in the VPlan-native path - return conservative
1340     // result until this changes.
1341     if (EnableVPlanNativePath)
1342       return false;
1343 
1344     auto UniformsPerVF = Uniforms.find(VF);
1345     assert(UniformsPerVF != Uniforms.end() &&
1346            "VF not yet analyzed for uniformity");
1347     return UniformsPerVF->second.count(I);
1348   }
1349 
1350   /// Returns true if \p I is known to be scalar after vectorization.
1351   bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1352     if (VF.isScalar())
1353       return true;
1354 
1355     // Cost model is not run in the VPlan-native path - return conservative
1356     // result until this changes.
1357     if (EnableVPlanNativePath)
1358       return false;
1359 
1360     auto ScalarsPerVF = Scalars.find(VF);
1361     assert(ScalarsPerVF != Scalars.end() &&
1362            "Scalar values are not calculated for VF");
1363     return ScalarsPerVF->second.count(I);
1364   }
1365 
1366   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1367   /// for vectorization factor \p VF.
1368   bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1369     return VF.isVector() && MinBWs.contains(I) &&
1370            !isProfitableToScalarize(I, VF) &&
1371            !isScalarAfterVectorization(I, VF);
1372   }
1373 
1374   /// Decision that was taken during cost calculation for memory instruction.
1375   enum InstWidening {
1376     CM_Unknown,
1377     CM_Widen,         // For consecutive accesses with stride +1.
1378     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1379     CM_Interleave,
1380     CM_GatherScatter,
1381     CM_Scalarize,
1382     CM_VectorCall,
1383     CM_IntrinsicCall
1384   };
1385 
1386   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1387   /// instruction \p I and vector width \p VF.
1388   void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1389                            InstructionCost Cost) {
1390     assert(VF.isVector() && "Expected VF >=2");
1391     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1392   }
1393 
1394   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1395   /// interleaving group \p Grp and vector width \p VF.
1396   void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1397                            ElementCount VF, InstWidening W,
1398                            InstructionCost Cost) {
1399     assert(VF.isVector() && "Expected VF >=2");
1400     /// Broadcast this decicion to all instructions inside the group.
1401     /// But the cost will be assigned to one instruction only.
1402     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1403       if (auto *I = Grp->getMember(i)) {
1404         if (Grp->getInsertPos() == I)
1405           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1406         else
1407           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1408       }
1409     }
1410   }
1411 
1412   /// Return the cost model decision for the given instruction \p I and vector
1413   /// width \p VF. Return CM_Unknown if this instruction did not pass
1414   /// through the cost modeling.
1415   InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1416     assert(VF.isVector() && "Expected VF to be a vector VF");
1417     // Cost model is not run in the VPlan-native path - return conservative
1418     // result until this changes.
1419     if (EnableVPlanNativePath)
1420       return CM_GatherScatter;
1421 
1422     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1423     auto Itr = WideningDecisions.find(InstOnVF);
1424     if (Itr == WideningDecisions.end())
1425       return CM_Unknown;
1426     return Itr->second.first;
1427   }
1428 
1429   /// Return the vectorization cost for the given instruction \p I and vector
1430   /// width \p VF.
1431   InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1432     assert(VF.isVector() && "Expected VF >=2");
1433     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1434     assert(WideningDecisions.contains(InstOnVF) &&
1435            "The cost is not calculated");
1436     return WideningDecisions[InstOnVF].second;
1437   }
1438 
1439   struct CallWideningDecision {
1440     InstWidening Kind;
1441     Function *Variant;
1442     Intrinsic::ID IID;
1443     std::optional<unsigned> MaskPos;
1444     InstructionCost Cost;
1445   };
1446 
1447   void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind,
1448                                Function *Variant, Intrinsic::ID IID,
1449                                std::optional<unsigned> MaskPos,
1450                                InstructionCost Cost) {
1451     assert(!VF.isScalar() && "Expected vector VF");
1452     CallWideningDecisions[std::make_pair(CI, VF)] = {Kind, Variant, IID,
1453                                                      MaskPos, Cost};
1454   }
1455 
1456   CallWideningDecision getCallWideningDecision(CallInst *CI,
1457                                                ElementCount VF) const {
1458     assert(!VF.isScalar() && "Expected vector VF");
1459     return CallWideningDecisions.at(std::make_pair(CI, VF));
1460   }
1461 
1462   /// Return True if instruction \p I is an optimizable truncate whose operand
1463   /// is an induction variable. Such a truncate will be removed by adding a new
1464   /// induction variable with the destination type.
1465   bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1466     // If the instruction is not a truncate, return false.
1467     auto *Trunc = dyn_cast<TruncInst>(I);
1468     if (!Trunc)
1469       return false;
1470 
1471     // Get the source and destination types of the truncate.
1472     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1473     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1474 
1475     // If the truncate is free for the given types, return false. Replacing a
1476     // free truncate with an induction variable would add an induction variable
1477     // update instruction to each iteration of the loop. We exclude from this
1478     // check the primary induction variable since it will need an update
1479     // instruction regardless.
1480     Value *Op = Trunc->getOperand(0);
1481     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1482       return false;
1483 
1484     // If the truncated value is not an induction variable, return false.
1485     return Legal->isInductionPhi(Op);
1486   }
1487 
1488   /// Collects the instructions to scalarize for each predicated instruction in
1489   /// the loop.
1490   void collectInstsToScalarize(ElementCount VF);
1491 
1492   /// Collect Uniform and Scalar values for the given \p VF.
1493   /// The sets depend on CM decision for Load/Store instructions
1494   /// that may be vectorized as interleave, gather-scatter or scalarized.
1495   /// Also make a decision on what to do about call instructions in the loop
1496   /// at that VF -- scalarize, call a known vector routine, or call a
1497   /// vector intrinsic.
1498   void collectUniformsAndScalars(ElementCount VF) {
1499     // Do the analysis once.
1500     if (VF.isScalar() || Uniforms.contains(VF))
1501       return;
1502     setCostBasedWideningDecision(VF);
1503     setVectorizedCallDecision(VF);
1504     collectLoopUniforms(VF);
1505     collectLoopScalars(VF);
1506   }
1507 
1508   /// Returns true if the target machine supports masked store operation
1509   /// for the given \p DataType and kind of access to \p Ptr.
1510   bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1511     return Legal->isConsecutivePtr(DataType, Ptr) &&
1512            TTI.isLegalMaskedStore(DataType, Alignment);
1513   }
1514 
1515   /// Returns true if the target machine supports masked load operation
1516   /// for the given \p DataType and kind of access to \p Ptr.
1517   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1518     return Legal->isConsecutivePtr(DataType, Ptr) &&
1519            TTI.isLegalMaskedLoad(DataType, Alignment);
1520   }
1521 
1522   /// Returns true if the target machine can represent \p V as a masked gather
1523   /// or scatter operation.
1524   bool isLegalGatherOrScatter(Value *V, ElementCount VF) {
1525     bool LI = isa<LoadInst>(V);
1526     bool SI = isa<StoreInst>(V);
1527     if (!LI && !SI)
1528       return false;
1529     auto *Ty = getLoadStoreType(V);
1530     Align Align = getLoadStoreAlignment(V);
1531     if (VF.isVector())
1532       Ty = VectorType::get(Ty, VF);
1533     return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1534            (SI && TTI.isLegalMaskedScatter(Ty, Align));
1535   }
1536 
1537   /// Returns true if the target machine supports all of the reduction
1538   /// variables found for the given VF.
1539   bool canVectorizeReductions(ElementCount VF) const {
1540     return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1541       const RecurrenceDescriptor &RdxDesc = Reduction.second;
1542       return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1543     }));
1544   }
1545 
1546   /// Given costs for both strategies, return true if the scalar predication
1547   /// lowering should be used for div/rem.  This incorporates an override
1548   /// option so it is not simply a cost comparison.
1549   bool isDivRemScalarWithPredication(InstructionCost ScalarCost,
1550                                      InstructionCost SafeDivisorCost) const {
1551     switch (ForceSafeDivisor) {
1552     case cl::BOU_UNSET:
1553       return ScalarCost < SafeDivisorCost;
1554     case cl::BOU_TRUE:
1555       return false;
1556     case cl::BOU_FALSE:
1557       return true;
1558     };
1559     llvm_unreachable("impossible case value");
1560   }
1561 
1562   /// Returns true if \p I is an instruction which requires predication and
1563   /// for which our chosen predication strategy is scalarization (i.e. we
1564   /// don't have an alternate strategy such as masking available).
1565   /// \p VF is the vectorization factor that will be used to vectorize \p I.
1566   bool isScalarWithPredication(Instruction *I, ElementCount VF) const;
1567 
1568   /// Returns true if \p I is an instruction that needs to be predicated
1569   /// at runtime.  The result is independent of the predication mechanism.
1570   /// Superset of instructions that return true for isScalarWithPredication.
1571   bool isPredicatedInst(Instruction *I) const;
1572 
1573   /// Return the costs for our two available strategies for lowering a
1574   /// div/rem operation which requires speculating at least one lane.
1575   /// First result is for scalarization (will be invalid for scalable
1576   /// vectors); second is for the safe-divisor strategy.
1577   std::pair<InstructionCost, InstructionCost>
1578   getDivRemSpeculationCost(Instruction *I,
1579                            ElementCount VF) const;
1580 
1581   /// Returns true if \p I is a memory instruction with consecutive memory
1582   /// access that can be widened.
1583   bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF);
1584 
1585   /// Returns true if \p I is a memory instruction in an interleaved-group
1586   /// of memory accesses that can be vectorized with wide vector loads/stores
1587   /// and shuffles.
1588   bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF);
1589 
1590   /// Check if \p Instr belongs to any interleaved access group.
1591   bool isAccessInterleaved(Instruction *Instr) {
1592     return InterleaveInfo.isInterleaved(Instr);
1593   }
1594 
1595   /// Get the interleaved access group that \p Instr belongs to.
1596   const InterleaveGroup<Instruction> *
1597   getInterleavedAccessGroup(Instruction *Instr) {
1598     return InterleaveInfo.getInterleaveGroup(Instr);
1599   }
1600 
1601   /// Returns true if we're required to use a scalar epilogue for at least
1602   /// the final iteration of the original loop.
1603   bool requiresScalarEpilogue(bool IsVectorizing) const {
1604     if (!isScalarEpilogueAllowed())
1605       return false;
1606     // If we might exit from anywhere but the latch, must run the exiting
1607     // iteration in scalar form.
1608     if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
1609       return true;
1610     return IsVectorizing && InterleaveInfo.requiresScalarEpilogue();
1611   }
1612 
1613   /// Returns true if we're required to use a scalar epilogue for at least
1614   /// the final iteration of the original loop for all VFs in \p Range.
1615   /// A scalar epilogue must either be required for all VFs in \p Range or for
1616   /// none.
1617   bool requiresScalarEpilogue(VFRange Range) const {
1618     auto RequiresScalarEpilogue = [this](ElementCount VF) {
1619       return requiresScalarEpilogue(VF.isVector());
1620     };
1621     bool IsRequired = all_of(Range, RequiresScalarEpilogue);
1622     assert(
1623         (IsRequired || none_of(Range, RequiresScalarEpilogue)) &&
1624         "all VFs in range must agree on whether a scalar epilogue is required");
1625     return IsRequired;
1626   }
1627 
1628   /// Returns true if a scalar epilogue is not allowed due to optsize or a
1629   /// loop hint annotation.
1630   bool isScalarEpilogueAllowed() const {
1631     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1632   }
1633 
1634   /// Returns the TailFoldingStyle that is best for the current loop.
1635   TailFoldingStyle
1636   getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
1637     if (!CanFoldTailByMasking)
1638       return TailFoldingStyle::None;
1639 
1640     if (ForceTailFoldingStyle.getNumOccurrences())
1641       return ForceTailFoldingStyle;
1642 
1643     return TTI.getPreferredTailFoldingStyle(IVUpdateMayOverflow);
1644   }
1645 
1646   /// Returns true if all loop blocks should be masked to fold tail loop.
1647   bool foldTailByMasking() const {
1648     return getTailFoldingStyle() != TailFoldingStyle::None;
1649   }
1650 
1651   /// Returns true if the instructions in this block requires predication
1652   /// for any reason, e.g. because tail folding now requires a predicate
1653   /// or because the block in the original loop was predicated.
1654   bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
1655     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1656   }
1657 
1658   /// Returns true if the Phi is part of an inloop reduction.
1659   bool isInLoopReduction(PHINode *Phi) const {
1660     return InLoopReductions.contains(Phi);
1661   }
1662 
1663   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1664   /// with factor VF.  Return the cost of the instruction, including
1665   /// scalarization overhead if it's needed.
1666   InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1667 
1668   /// Estimate cost of a call instruction CI if it were vectorized with factor
1669   /// VF. Return the cost of the instruction, including scalarization overhead
1670   /// if it's needed.
1671   InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const;
1672 
1673   /// Invalidates decisions already taken by the cost model.
1674   void invalidateCostModelingDecisions() {
1675     WideningDecisions.clear();
1676     CallWideningDecisions.clear();
1677     Uniforms.clear();
1678     Scalars.clear();
1679   }
1680 
1681   /// The vectorization cost is a combination of the cost itself and a boolean
1682   /// indicating whether any of the contributing operations will actually
1683   /// operate on vector values after type legalization in the backend. If this
1684   /// latter value is false, then all operations will be scalarized (i.e. no
1685   /// vectorization has actually taken place).
1686   using VectorizationCostTy = std::pair<InstructionCost, bool>;
1687 
1688   /// Returns the expected execution cost. The unit of the cost does
1689   /// not matter because we use the 'cost' units to compare different
1690   /// vector widths. The cost that is returned is *not* normalized by
1691   /// the factor width. If \p Invalid is not nullptr, this function
1692   /// will add a pair(Instruction*, ElementCount) to \p Invalid for
1693   /// each instruction that has an Invalid cost for the given VF.
1694   VectorizationCostTy
1695   expectedCost(ElementCount VF,
1696                SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);
1697 
1698   bool hasPredStores() const { return NumPredStores > 0; }
1699 
1700   /// Returns true if epilogue vectorization is considered profitable, and
1701   /// false otherwise.
1702   /// \p VF is the vectorization factor chosen for the original loop.
1703   bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
1704 
1705 private:
1706   unsigned NumPredStores = 0;
1707 
1708   /// \return An upper bound for the vectorization factors for both
1709   /// fixed and scalable vectorization, where the minimum-known number of
1710   /// elements is a power-of-2 larger than zero. If scalable vectorization is
1711   /// disabled or unsupported, then the scalable part will be equal to
1712   /// ElementCount::getScalable(0).
1713   FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount,
1714                                            ElementCount UserVF,
1715                                            bool FoldTailByMasking);
1716 
1717   /// \return the maximized element count based on the targets vector
1718   /// registers and the loop trip-count, but limited to a maximum safe VF.
1719   /// This is a helper function of computeFeasibleMaxVF.
1720   ElementCount getMaximizedVFForTarget(unsigned MaxTripCount,
1721                                        unsigned SmallestType,
1722                                        unsigned WidestType,
1723                                        ElementCount MaxSafeVF,
1724                                        bool FoldTailByMasking);
1725 
1726   /// \return the maximum legal scalable VF, based on the safe max number
1727   /// of elements.
1728   ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1729 
1730   /// Returns the execution time cost of an instruction for a given vector
1731   /// width. Vector width of one means scalar.
1732   VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1733 
1734   /// The cost-computation logic from getInstructionCost which provides
1735   /// the vector type as an output parameter.
1736   InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
1737                                      Type *&VectorTy);
1738 
1739   /// Return the cost of instructions in an inloop reduction pattern, if I is
1740   /// part of that pattern.
1741   std::optional<InstructionCost>
1742   getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
1743                           TTI::TargetCostKind CostKind) const;
1744 
1745   /// Calculate vectorization cost of memory instruction \p I.
1746   InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1747 
1748   /// The cost computation for scalarized memory instruction.
1749   InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1750 
1751   /// The cost computation for interleaving group of memory instructions.
1752   InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1753 
1754   /// The cost computation for Gather/Scatter instruction.
1755   InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1756 
1757   /// The cost computation for widening instruction \p I with consecutive
1758   /// memory access.
1759   InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1760 
1761   /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1762   /// Load: scalar load + broadcast.
1763   /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1764   /// element)
1765   InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1766 
1767   /// Estimate the overhead of scalarizing an instruction. This is a
1768   /// convenience wrapper for the type-based getScalarizationOverhead API.
1769   InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF,
1770                                            TTI::TargetCostKind CostKind) const;
1771 
1772   /// Returns true if an artificially high cost for emulated masked memrefs
1773   /// should be used.
1774   bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1775 
1776   /// Map of scalar integer values to the smallest bitwidth they can be legally
1777   /// represented as. The vector equivalents of these values should be truncated
1778   /// to this type.
1779   MapVector<Instruction *, uint64_t> MinBWs;
1780 
1781   /// A type representing the costs for instructions if they were to be
1782   /// scalarized rather than vectorized. The entries are Instruction-Cost
1783   /// pairs.
1784   using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1785 
1786   /// A set containing all BasicBlocks that are known to present after
1787   /// vectorization as a predicated block.
1788   DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>>
1789       PredicatedBBsAfterVectorization;
1790 
1791   /// Records whether it is allowed to have the original scalar loop execute at
1792   /// least once. This may be needed as a fallback loop in case runtime
1793   /// aliasing/dependence checks fail, or to handle the tail/remainder
1794   /// iterations when the trip count is unknown or doesn't divide by the VF,
1795   /// or as a peel-loop to handle gaps in interleave-groups.
1796   /// Under optsize and when the trip count is very small we don't allow any
1797   /// iterations to execute in the scalar loop.
1798   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1799 
1800   /// All blocks of loop are to be masked to fold tail of scalar iterations.
1801   bool CanFoldTailByMasking = false;
1802 
1803   /// A map holding scalar costs for different vectorization factors. The
1804   /// presence of a cost for an instruction in the mapping indicates that the
1805   /// instruction will be scalarized when vectorizing with the associated
1806   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1807   DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1808 
1809   /// Holds the instructions known to be uniform after vectorization.
1810   /// The data is collected per VF.
1811   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1812 
1813   /// Holds the instructions known to be scalar after vectorization.
1814   /// The data is collected per VF.
1815   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1816 
1817   /// Holds the instructions (address computations) that are forced to be
1818   /// scalarized.
1819   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1820 
1821   /// PHINodes of the reductions that should be expanded in-loop.
1822   SmallPtrSet<PHINode *, 4> InLoopReductions;
1823 
1824   /// A Map of inloop reduction operations and their immediate chain operand.
1825   /// FIXME: This can be removed once reductions can be costed correctly in
1826   /// VPlan. This was added to allow quick lookup of the inloop operations.
1827   DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1828 
1829   /// Returns the expected difference in cost from scalarizing the expression
1830   /// feeding a predicated instruction \p PredInst. The instructions to
1831   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1832   /// non-negative return value implies the expression will be scalarized.
1833   /// Currently, only single-use chains are considered for scalarization.
1834   InstructionCost computePredInstDiscount(Instruction *PredInst,
1835                                           ScalarCostsTy &ScalarCosts,
1836                                           ElementCount VF);
1837 
1838   /// Collect the instructions that are uniform after vectorization. An
1839   /// instruction is uniform if we represent it with a single scalar value in
1840   /// the vectorized loop corresponding to each vector iteration. Examples of
1841   /// uniform instructions include pointer operands of consecutive or
1842   /// interleaved memory accesses. Note that although uniformity implies an
1843   /// instruction will be scalar, the reverse is not true. In general, a
1844   /// scalarized instruction will be represented by VF scalar values in the
1845   /// vectorized loop, each corresponding to an iteration of the original
1846   /// scalar loop.
1847   void collectLoopUniforms(ElementCount VF);
1848 
1849   /// Collect the instructions that are scalar after vectorization. An
1850   /// instruction is scalar if it is known to be uniform or will be scalarized
1851   /// during vectorization. collectLoopScalars should only add non-uniform nodes
1852   /// to the list if they are used by a load/store instruction that is marked as
1853   /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
1854   /// VF values in the vectorized loop, each corresponding to an iteration of
1855   /// the original scalar loop.
1856   void collectLoopScalars(ElementCount VF);
1857 
1858   /// Keeps cost model vectorization decision and cost for instructions.
1859   /// Right now it is used for memory instructions only.
1860   using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1861                                 std::pair<InstWidening, InstructionCost>>;
1862 
1863   DecisionList WideningDecisions;
1864 
1865   using CallDecisionList =
1866       DenseMap<std::pair<CallInst *, ElementCount>, CallWideningDecision>;
1867 
1868   CallDecisionList CallWideningDecisions;
1869 
1870   /// Returns true if \p V is expected to be vectorized and it needs to be
1871   /// extracted.
1872   bool needsExtract(Value *V, ElementCount VF) const {
1873     Instruction *I = dyn_cast<Instruction>(V);
1874     if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1875         TheLoop->isLoopInvariant(I))
1876       return false;
1877 
1878     // Assume we can vectorize V (and hence we need extraction) if the
1879     // scalars are not computed yet. This can happen, because it is called
1880     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1881     // the scalars are collected. That should be a safe assumption in most
1882     // cases, because we check if the operands have vectorizable types
1883     // beforehand in LoopVectorizationLegality.
1884     return !Scalars.contains(VF) || !isScalarAfterVectorization(I, VF);
1885   };
1886 
1887   /// Returns a range containing only operands needing to be extracted.
1888   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1889                                                    ElementCount VF) const {
1890     return SmallVector<Value *, 4>(make_filter_range(
1891         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1892   }
1893 
1894 public:
1895   /// The loop that we evaluate.
1896   Loop *TheLoop;
1897 
1898   /// Predicated scalar evolution analysis.
1899   PredicatedScalarEvolution &PSE;
1900 
1901   /// Loop Info analysis.
1902   LoopInfo *LI;
1903 
1904   /// Vectorization legality.
1905   LoopVectorizationLegality *Legal;
1906 
1907   /// Vector target information.
1908   const TargetTransformInfo &TTI;
1909 
1910   /// Target Library Info.
1911   const TargetLibraryInfo *TLI;
1912 
1913   /// Demanded bits analysis.
1914   DemandedBits *DB;
1915 
1916   /// Assumption cache.
1917   AssumptionCache *AC;
1918 
1919   /// Interface to emit optimization remarks.
1920   OptimizationRemarkEmitter *ORE;
1921 
1922   const Function *TheFunction;
1923 
1924   /// Loop Vectorize Hint.
1925   const LoopVectorizeHints *Hints;
1926 
1927   /// The interleave access information contains groups of interleaved accesses
1928   /// with the same stride and close to each other.
1929   InterleavedAccessInfo &InterleaveInfo;
1930 
1931   /// Values to ignore in the cost model.
1932   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1933 
1934   /// Values to ignore in the cost model when VF > 1.
1935   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1936 
1937   /// All element types found in the loop.
1938   SmallPtrSet<Type *, 16> ElementTypesInLoop;
1939 };
1940 } // end namespace llvm
1941 
1942 namespace {
1943 /// Helper struct to manage generating runtime checks for vectorization.
1944 ///
1945 /// The runtime checks are created up-front in temporary blocks to allow better
1946 /// estimating the cost and un-linked from the existing IR. After deciding to
1947 /// vectorize, the checks are moved back. If deciding not to vectorize, the
1948 /// temporary blocks are completely removed.
1949 class GeneratedRTChecks {
1950   /// Basic block which contains the generated SCEV checks, if any.
1951   BasicBlock *SCEVCheckBlock = nullptr;
1952 
1953   /// The value representing the result of the generated SCEV checks. If it is
1954   /// nullptr, either no SCEV checks have been generated or they have been used.
1955   Value *SCEVCheckCond = nullptr;
1956 
1957   /// Basic block which contains the generated memory runtime checks, if any.
1958   BasicBlock *MemCheckBlock = nullptr;
1959 
1960   /// The value representing the result of the generated memory runtime checks.
1961   /// If it is nullptr, either no memory runtime checks have been generated or
1962   /// they have been used.
1963   Value *MemRuntimeCheckCond = nullptr;
1964 
1965   DominatorTree *DT;
1966   LoopInfo *LI;
1967   TargetTransformInfo *TTI;
1968 
1969   SCEVExpander SCEVExp;
1970   SCEVExpander MemCheckExp;
1971 
1972   bool CostTooHigh = false;
1973   const bool AddBranchWeights;
1974 
1975 public:
1976   GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
1977                     TargetTransformInfo *TTI, const DataLayout &DL,
1978                     bool AddBranchWeights)
1979       : DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"),
1980         MemCheckExp(SE, DL, "scev.check"), AddBranchWeights(AddBranchWeights) {}
1981 
1982   /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1983   /// accurately estimate the cost of the runtime checks. The blocks are
1984   /// un-linked from the IR and is added back during vector code generation. If
1985   /// there is no vector code generation, the check blocks are removed
1986   /// completely.
1987   void Create(Loop *L, const LoopAccessInfo &LAI,
1988               const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {
1989 
1990     // Hard cutoff to limit compile-time increase in case a very large number of
1991     // runtime checks needs to be generated.
1992     // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
1993     // profile info.
1994     CostTooHigh =
1995         LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold;
1996     if (CostTooHigh)
1997       return;
1998 
1999     BasicBlock *LoopHeader = L->getHeader();
2000     BasicBlock *Preheader = L->getLoopPreheader();
2001 
2002     // Use SplitBlock to create blocks for SCEV & memory runtime checks to
2003     // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
2004     // may be used by SCEVExpander. The blocks will be un-linked from their
2005     // predecessors and removed from LI & DT at the end of the function.
2006     if (!UnionPred.isAlwaysTrue()) {
2007       SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
2008                                   nullptr, "vector.scevcheck");
2009 
2010       SCEVCheckCond = SCEVExp.expandCodeForPredicate(
2011           &UnionPred, SCEVCheckBlock->getTerminator());
2012     }
2013 
2014     const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
2015     if (RtPtrChecking.Need) {
2016       auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
2017       MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
2018                                  "vector.memcheck");
2019 
2020       auto DiffChecks = RtPtrChecking.getDiffChecks();
2021       if (DiffChecks) {
2022         Value *RuntimeVF = nullptr;
2023         MemRuntimeCheckCond = addDiffRuntimeChecks(
2024             MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp,
2025             [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
2026               if (!RuntimeVF)
2027                 RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);
2028               return RuntimeVF;
2029             },
2030             IC);
2031       } else {
2032         MemRuntimeCheckCond = addRuntimeChecks(
2033             MemCheckBlock->getTerminator(), L, RtPtrChecking.getChecks(),
2034             MemCheckExp, VectorizerParams::HoistRuntimeChecks);
2035       }
2036       assert(MemRuntimeCheckCond &&
2037              "no RT checks generated although RtPtrChecking "
2038              "claimed checks are required");
2039     }
2040 
2041     if (!MemCheckBlock && !SCEVCheckBlock)
2042       return;
2043 
2044     // Unhook the temporary block with the checks, update various places
2045     // accordingly.
2046     if (SCEVCheckBlock)
2047       SCEVCheckBlock->replaceAllUsesWith(Preheader);
2048     if (MemCheckBlock)
2049       MemCheckBlock->replaceAllUsesWith(Preheader);
2050 
2051     if (SCEVCheckBlock) {
2052       SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
2053       new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
2054       Preheader->getTerminator()->eraseFromParent();
2055     }
2056     if (MemCheckBlock) {
2057       MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
2058       new UnreachableInst(Preheader->getContext(), MemCheckBlock);
2059       Preheader->getTerminator()->eraseFromParent();
2060     }
2061 
2062     DT->changeImmediateDominator(LoopHeader, Preheader);
2063     if (MemCheckBlock) {
2064       DT->eraseNode(MemCheckBlock);
2065       LI->removeBlock(MemCheckBlock);
2066     }
2067     if (SCEVCheckBlock) {
2068       DT->eraseNode(SCEVCheckBlock);
2069       LI->removeBlock(SCEVCheckBlock);
2070     }
2071   }
2072 
2073   InstructionCost getCost() {
2074     if (SCEVCheckBlock || MemCheckBlock)
2075       LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");
2076 
2077     if (CostTooHigh) {
2078       InstructionCost Cost;
2079       Cost.setInvalid();
2080       LLVM_DEBUG(dbgs() << "  number of checks exceeded threshold\n");
2081       return Cost;
2082     }
2083 
2084     InstructionCost RTCheckCost = 0;
2085     if (SCEVCheckBlock)
2086       for (Instruction &I : *SCEVCheckBlock) {
2087         if (SCEVCheckBlock->getTerminator() == &I)
2088           continue;
2089         InstructionCost C =
2090             TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
2091         LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
2092         RTCheckCost += C;
2093       }
2094     if (MemCheckBlock)
2095       for (Instruction &I : *MemCheckBlock) {
2096         if (MemCheckBlock->getTerminator() == &I)
2097           continue;
2098         InstructionCost C =
2099             TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
2100         LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
2101         RTCheckCost += C;
2102       }
2103 
2104     if (SCEVCheckBlock || MemCheckBlock)
2105       LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
2106                         << "\n");
2107 
2108     return RTCheckCost;
2109   }
2110 
2111   /// Remove the created SCEV & memory runtime check blocks & instructions, if
2112   /// unused.
2113   ~GeneratedRTChecks() {
2114     SCEVExpanderCleaner SCEVCleaner(SCEVExp);
2115     SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
2116     if (!SCEVCheckCond)
2117       SCEVCleaner.markResultUsed();
2118 
2119     if (!MemRuntimeCheckCond)
2120       MemCheckCleaner.markResultUsed();
2121 
2122     if (MemRuntimeCheckCond) {
2123       auto &SE = *MemCheckExp.getSE();
2124       // Memory runtime check generation creates compares that use expanded
2125       // values. Remove them before running the SCEVExpanderCleaners.
2126       for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
2127         if (MemCheckExp.isInsertedInstruction(&I))
2128           continue;
2129         SE.forgetValue(&I);
2130         I.eraseFromParent();
2131       }
2132     }
2133     MemCheckCleaner.cleanup();
2134     SCEVCleaner.cleanup();
2135 
2136     if (SCEVCheckCond)
2137       SCEVCheckBlock->eraseFromParent();
2138     if (MemRuntimeCheckCond)
2139       MemCheckBlock->eraseFromParent();
2140   }
2141 
2142   /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
2143   /// adjusts the branches to branch to the vector preheader or \p Bypass,
2144   /// depending on the generated condition.
2145   BasicBlock *emitSCEVChecks(BasicBlock *Bypass,
2146                              BasicBlock *LoopVectorPreHeader,
2147                              BasicBlock *LoopExitBlock) {
2148     if (!SCEVCheckCond)
2149       return nullptr;
2150 
2151     Value *Cond = SCEVCheckCond;
2152     // Mark the check as used, to prevent it from being removed during cleanup.
2153     SCEVCheckCond = nullptr;
2154     if (auto *C = dyn_cast<ConstantInt>(Cond))
2155       if (C->isZero())
2156         return nullptr;
2157 
2158     auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2159 
2160     BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
2161     // Create new preheader for vector loop.
2162     if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2163       PL->addBasicBlockToLoop(SCEVCheckBlock, *LI);
2164 
2165     SCEVCheckBlock->getTerminator()->eraseFromParent();
2166     SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2167     Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2168                                                 SCEVCheckBlock);
2169 
2170     DT->addNewBlock(SCEVCheckBlock, Pred);
2171     DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
2172 
2173     BranchInst &BI = *BranchInst::Create(Bypass, LoopVectorPreHeader, Cond);
2174     if (AddBranchWeights)
2175       setBranchWeights(BI, SCEVCheckBypassWeights);
2176     ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), &BI);
2177     return SCEVCheckBlock;
2178   }
2179 
2180   /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2181   /// the branches to branch to the vector preheader or \p Bypass, depending on
2182   /// the generated condition.
2183   BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass,
2184                                    BasicBlock *LoopVectorPreHeader) {
2185     // Check if we generated code that checks in runtime if arrays overlap.
2186     if (!MemRuntimeCheckCond)
2187       return nullptr;
2188 
2189     auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2190     Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2191                                                 MemCheckBlock);
2192 
2193     DT->addNewBlock(MemCheckBlock, Pred);
2194     DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
2195     MemCheckBlock->moveBefore(LoopVectorPreHeader);
2196 
2197     if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2198       PL->addBasicBlockToLoop(MemCheckBlock, *LI);
2199 
2200     BranchInst &BI =
2201         *BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond);
2202     if (AddBranchWeights) {
2203       setBranchWeights(BI, MemCheckBypassWeights);
2204     }
2205     ReplaceInstWithInst(MemCheckBlock->getTerminator(), &BI);
2206     MemCheckBlock->getTerminator()->setDebugLoc(
2207         Pred->getTerminator()->getDebugLoc());
2208 
2209     // Mark the check as used, to prevent it from being removed during cleanup.
2210     MemRuntimeCheckCond = nullptr;
2211     return MemCheckBlock;
2212   }
2213 };
2214 } // namespace
2215 
2216 static bool useActiveLaneMask(TailFoldingStyle Style) {
2217   return Style == TailFoldingStyle::Data ||
2218          Style == TailFoldingStyle::DataAndControlFlow ||
2219          Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2220 }
2221 
2222 static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style) {
2223   return Style == TailFoldingStyle::DataAndControlFlow ||
2224          Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2225 }
2226 
2227 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
2228 // vectorization. The loop needs to be annotated with #pragma omp simd
2229 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
2230 // vector length information is not provided, vectorization is not considered
2231 // explicit. Interleave hints are not allowed either. These limitations will be
2232 // relaxed in the future.
2233 // Please, note that we are currently forced to abuse the pragma 'clang
2234 // vectorize' semantics. This pragma provides *auto-vectorization hints*
2235 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2236 // provides *explicit vectorization hints* (LV can bypass legal checks and
2237 // assume that vectorization is legal). However, both hints are implemented
2238 // using the same metadata (llvm.loop.vectorize, processed by
2239 // LoopVectorizeHints). This will be fixed in the future when the native IR
2240 // representation for pragma 'omp simd' is introduced.
2241 static bool isExplicitVecOuterLoop(Loop *OuterLp,
2242                                    OptimizationRemarkEmitter *ORE) {
2243   assert(!OuterLp->isInnermost() && "This is not an outer loop");
2244   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2245 
2246   // Only outer loops with an explicit vectorization hint are supported.
2247   // Unannotated outer loops are ignored.
2248   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
2249     return false;
2250 
2251   Function *Fn = OuterLp->getHeader()->getParent();
2252   if (!Hints.allowVectorization(Fn, OuterLp,
2253                                 true /*VectorizeOnlyWhenForced*/)) {
2254     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2255     return false;
2256   }
2257 
2258   if (Hints.getInterleave() > 1) {
2259     // TODO: Interleave support is future work.
2260     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2261                          "outer loops.\n");
2262     Hints.emitRemarkWithHints();
2263     return false;
2264   }
2265 
2266   return true;
2267 }
2268 
2269 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
2270                                   OptimizationRemarkEmitter *ORE,
2271                                   SmallVectorImpl<Loop *> &V) {
2272   // Collect inner loops and outer loops without irreducible control flow. For
2273   // now, only collect outer loops that have explicit vectorization hints. If we
2274   // are stress testing the VPlan H-CFG construction, we collect the outermost
2275   // loop of every loop nest.
2276   if (L.isInnermost() || VPlanBuildStressTest ||
2277       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
2278     LoopBlocksRPO RPOT(&L);
2279     RPOT.perform(LI);
2280     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2281       V.push_back(&L);
2282       // TODO: Collect inner loops inside marked outer loops in case
2283       // vectorization fails for the outer loop. Do not invoke
2284       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2285       // already known to be reducible. We can use an inherited attribute for
2286       // that.
2287       return;
2288     }
2289   }
2290   for (Loop *InnerL : L)
2291     collectSupportedLoops(*InnerL, LI, ORE, V);
2292 }
2293 
2294 //===----------------------------------------------------------------------===//
2295 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2296 // LoopVectorizationCostModel and LoopVectorizationPlanner.
2297 //===----------------------------------------------------------------------===//
2298 
2299 /// Compute the transformed value of Index at offset StartValue using step
2300 /// StepValue.
2301 /// For integer induction, returns StartValue + Index * StepValue.
2302 /// For pointer induction, returns StartValue[Index * StepValue].
2303 /// FIXME: The newly created binary instructions should contain nsw/nuw
2304 /// flags, which can be found from the original scalar operations.
2305 static Value *
2306 emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue,
2307                      Value *Step,
2308                      InductionDescriptor::InductionKind InductionKind,
2309                      const BinaryOperator *InductionBinOp) {
2310   Type *StepTy = Step->getType();
2311   Value *CastedIndex = StepTy->isIntegerTy()
2312                            ? B.CreateSExtOrTrunc(Index, StepTy)
2313                            : B.CreateCast(Instruction::SIToFP, Index, StepTy);
2314   if (CastedIndex != Index) {
2315     CastedIndex->setName(CastedIndex->getName() + ".cast");
2316     Index = CastedIndex;
2317   }
2318 
2319   // Note: the IR at this point is broken. We cannot use SE to create any new
2320   // SCEV and then expand it, hoping that SCEV's simplification will give us
2321   // a more optimal code. Unfortunately, attempt of doing so on invalid IR may
2322   // lead to various SCEV crashes. So all we can do is to use builder and rely
2323   // on InstCombine for future simplifications. Here we handle some trivial
2324   // cases only.
2325   auto CreateAdd = [&B](Value *X, Value *Y) {
2326     assert(X->getType() == Y->getType() && "Types don't match!");
2327     if (auto *CX = dyn_cast<ConstantInt>(X))
2328       if (CX->isZero())
2329         return Y;
2330     if (auto *CY = dyn_cast<ConstantInt>(Y))
2331       if (CY->isZero())
2332         return X;
2333     return B.CreateAdd(X, Y);
2334   };
2335 
2336   // We allow X to be a vector type, in which case Y will potentially be
2337   // splatted into a vector with the same element count.
2338   auto CreateMul = [&B](Value *X, Value *Y) {
2339     assert(X->getType()->getScalarType() == Y->getType() &&
2340            "Types don't match!");
2341     if (auto *CX = dyn_cast<ConstantInt>(X))
2342       if (CX->isOne())
2343         return Y;
2344     if (auto *CY = dyn_cast<ConstantInt>(Y))
2345       if (CY->isOne())
2346         return X;
2347     VectorType *XVTy = dyn_cast<VectorType>(X->getType());
2348     if (XVTy && !isa<VectorType>(Y->getType()))
2349       Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
2350     return B.CreateMul(X, Y);
2351   };
2352 
2353   switch (InductionKind) {
2354   case InductionDescriptor::IK_IntInduction: {
2355     assert(!isa<VectorType>(Index->getType()) &&
2356            "Vector indices not supported for integer inductions yet");
2357     assert(Index->getType() == StartValue->getType() &&
2358            "Index type does not match StartValue type");
2359     if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne())
2360       return B.CreateSub(StartValue, Index);
2361     auto *Offset = CreateMul(Index, Step);
2362     return CreateAdd(StartValue, Offset);
2363   }
2364   case InductionDescriptor::IK_PtrInduction: {
2365     return B.CreateGEP(B.getInt8Ty(), StartValue, CreateMul(Index, Step));
2366   }
2367   case InductionDescriptor::IK_FpInduction: {
2368     assert(!isa<VectorType>(Index->getType()) &&
2369            "Vector indices not supported for FP inductions yet");
2370     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2371     assert(InductionBinOp &&
2372            (InductionBinOp->getOpcode() == Instruction::FAdd ||
2373             InductionBinOp->getOpcode() == Instruction::FSub) &&
2374            "Original bin op should be defined for FP induction");
2375 
2376     Value *MulExp = B.CreateFMul(Step, Index);
2377     return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2378                          "induction");
2379   }
2380   case InductionDescriptor::IK_NoInduction:
2381     return nullptr;
2382   }
2383   llvm_unreachable("invalid enum");
2384 }
2385 
2386 std::optional<unsigned> getMaxVScale(const Function &F,
2387                                      const TargetTransformInfo &TTI) {
2388   if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale())
2389     return MaxVScale;
2390 
2391   if (F.hasFnAttribute(Attribute::VScaleRange))
2392     return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
2393 
2394   return std::nullopt;
2395 }
2396 
2397 /// For the given VF and UF and maximum trip count computed for the loop, return
2398 /// whether the induction variable might overflow in the vectorized loop. If not,
2399 /// then we know a runtime overflow check always evaluates to false and can be
2400 /// removed.
2401 static bool isIndvarOverflowCheckKnownFalse(
2402     const LoopVectorizationCostModel *Cost,
2403     ElementCount VF, std::optional<unsigned> UF = std::nullopt) {
2404   // Always be conservative if we don't know the exact unroll factor.
2405   unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF);
2406 
2407   Type *IdxTy = Cost->Legal->getWidestInductionType();
2408   APInt MaxUIntTripCount = cast<IntegerType>(IdxTy)->getMask();
2409 
2410   // We know the runtime overflow check is known false iff the (max) trip-count
2411   // is known and (max) trip-count + (VF * UF) does not overflow in the type of
2412   // the vector loop induction variable.
2413   if (unsigned TC =
2414           Cost->PSE.getSE()->getSmallConstantMaxTripCount(Cost->TheLoop)) {
2415     uint64_t MaxVF = VF.getKnownMinValue();
2416     if (VF.isScalable()) {
2417       std::optional<unsigned> MaxVScale =
2418           getMaxVScale(*Cost->TheFunction, Cost->TTI);
2419       if (!MaxVScale)
2420         return false;
2421       MaxVF *= *MaxVScale;
2422     }
2423 
2424     return (MaxUIntTripCount - TC).ugt(MaxVF * MaxUF);
2425   }
2426 
2427   return false;
2428 }
2429 
2430 // Return whether we allow using masked interleave-groups (for dealing with
2431 // strided loads/stores that reside in predicated blocks, or for dealing
2432 // with gaps).
2433 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2434   // If an override option has been passed in for interleaved accesses, use it.
2435   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2436     return EnableMaskedInterleavedMemAccesses;
2437 
2438   return TTI.enableMaskedInterleavedAccessVectorization();
2439 }
2440 
2441 // Try to vectorize the interleave group that \p Instr belongs to.
2442 //
2443 // E.g. Translate following interleaved load group (factor = 3):
2444 //   for (i = 0; i < N; i+=3) {
2445 //     R = Pic[i];             // Member of index 0
2446 //     G = Pic[i+1];           // Member of index 1
2447 //     B = Pic[i+2];           // Member of index 2
2448 //     ... // do something to R, G, B
2449 //   }
2450 // To:
2451 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2452 //   %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9>   ; R elements
2453 //   %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10>  ; G elements
2454 //   %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11>  ; B elements
2455 //
2456 // Or translate following interleaved store group (factor = 3):
2457 //   for (i = 0; i < N; i+=3) {
2458 //     ... do something to R, G, B
2459 //     Pic[i]   = R;           // Member of index 0
2460 //     Pic[i+1] = G;           // Member of index 1
2461 //     Pic[i+2] = B;           // Member of index 2
2462 //   }
2463 // To:
2464 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2465 //   %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
2466 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2467 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2468 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2469 void InnerLoopVectorizer::vectorizeInterleaveGroup(
2470     const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
2471     VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
2472     VPValue *BlockInMask, bool NeedsMaskForGaps) {
2473   Instruction *Instr = Group->getInsertPos();
2474   const DataLayout &DL = Instr->getModule()->getDataLayout();
2475 
2476   // Prepare for the vector type of the interleaved load/store.
2477   Type *ScalarTy = getLoadStoreType(Instr);
2478   unsigned InterleaveFactor = Group->getFactor();
2479   auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2480 
2481   // Prepare for the new pointers.
2482   SmallVector<Value *, 2> AddrParts;
2483   unsigned Index = Group->getIndex(Instr);
2484 
2485   // TODO: extend the masked interleaved-group support to reversed access.
2486   assert((!BlockInMask || !Group->isReverse()) &&
2487          "Reversed masked interleave-group not supported.");
2488 
2489   Value *Idx;
2490   // If the group is reverse, adjust the index to refer to the last vector lane
2491   // instead of the first. We adjust the index from the first vector lane,
2492   // rather than directly getting the pointer for lane VF - 1, because the
2493   // pointer operand of the interleaved access is supposed to be uniform. For
2494   // uniform instructions, we're only required to generate a value for the
2495   // first vector lane in each unroll iteration.
2496   if (Group->isReverse()) {
2497     Value *RuntimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF);
2498     Idx = Builder.CreateSub(RuntimeVF, Builder.getInt32(1));
2499     Idx = Builder.CreateMul(Idx, Builder.getInt32(Group->getFactor()));
2500     Idx = Builder.CreateAdd(Idx, Builder.getInt32(Index));
2501     Idx = Builder.CreateNeg(Idx);
2502   } else
2503     Idx = Builder.getInt32(-Index);
2504 
2505   for (unsigned Part = 0; Part < UF; Part++) {
2506     Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
2507     if (auto *I = dyn_cast<Instruction>(AddrPart))
2508       State.setDebugLocFrom(I->getDebugLoc());
2509 
2510     // Notice current instruction could be any index. Need to adjust the address
2511     // to the member of index 0.
2512     //
2513     // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
2514     //       b = A[i];       // Member of index 0
2515     // Current pointer is pointed to A[i+1], adjust it to A[i].
2516     //
2517     // E.g.  A[i+1] = a;     // Member of index 1
2518     //       A[i]   = b;     // Member of index 0
2519     //       A[i+2] = c;     // Member of index 2 (Current instruction)
2520     // Current pointer is pointed to A[i+2], adjust it to A[i].
2521 
2522     bool InBounds = false;
2523     if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2524       InBounds = gep->isInBounds();
2525     AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Idx, "", InBounds);
2526     AddrParts.push_back(AddrPart);
2527   }
2528 
2529   State.setDebugLocFrom(Instr->getDebugLoc());
2530   Value *PoisonVec = PoisonValue::get(VecTy);
2531 
2532   auto CreateGroupMask = [this, &BlockInMask, &State, &InterleaveFactor](
2533                              unsigned Part, Value *MaskForGaps) -> Value * {
2534     if (VF.isScalable()) {
2535       assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
2536       assert(InterleaveFactor == 2 &&
2537              "Unsupported deinterleave factor for scalable vectors");
2538       auto *BlockInMaskPart = State.get(BlockInMask, Part);
2539       SmallVector<Value *, 2> Ops = {BlockInMaskPart, BlockInMaskPart};
2540       auto *MaskTy =
2541           VectorType::get(Builder.getInt1Ty(), VF.getKnownMinValue() * 2, true);
2542       return Builder.CreateIntrinsic(
2543           MaskTy, Intrinsic::experimental_vector_interleave2, Ops,
2544           /*FMFSource=*/nullptr, "interleaved.mask");
2545     }
2546 
2547     if (!BlockInMask)
2548       return MaskForGaps;
2549 
2550     Value *BlockInMaskPart = State.get(BlockInMask, Part);
2551     Value *ShuffledMask = Builder.CreateShuffleVector(
2552         BlockInMaskPart,
2553         createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2554         "interleaved.mask");
2555     return MaskForGaps ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2556                                              MaskForGaps)
2557                        : ShuffledMask;
2558   };
2559 
2560   // Vectorize the interleaved load group.
2561   if (isa<LoadInst>(Instr)) {
2562     Value *MaskForGaps = nullptr;
2563     if (NeedsMaskForGaps) {
2564       MaskForGaps =
2565           createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2566       assert(MaskForGaps && "Mask for Gaps is required but it is null");
2567     }
2568 
2569     // For each unroll part, create a wide load for the group.
2570     SmallVector<Value *, 2> NewLoads;
2571     for (unsigned Part = 0; Part < UF; Part++) {
2572       Instruction *NewLoad;
2573       if (BlockInMask || MaskForGaps) {
2574         assert(useMaskedInterleavedAccesses(*TTI) &&
2575                "masked interleaved groups are not allowed.");
2576         Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
2577         NewLoad =
2578             Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(),
2579                                      GroupMask, PoisonVec, "wide.masked.vec");
2580       }
2581       else
2582         NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2583                                             Group->getAlign(), "wide.vec");
2584       Group->addMetadata(NewLoad);
2585       NewLoads.push_back(NewLoad);
2586     }
2587 
2588     if (VecTy->isScalableTy()) {
2589       assert(InterleaveFactor == 2 &&
2590              "Unsupported deinterleave factor for scalable vectors");
2591 
2592       for (unsigned Part = 0; Part < UF; ++Part) {
2593         // Scalable vectors cannot use arbitrary shufflevectors (only splats),
2594         // so must use intrinsics to deinterleave.
2595         Value *DI = Builder.CreateIntrinsic(
2596             Intrinsic::experimental_vector_deinterleave2, VecTy, NewLoads[Part],
2597             /*FMFSource=*/nullptr, "strided.vec");
2598         unsigned J = 0;
2599         for (unsigned I = 0; I < InterleaveFactor; ++I) {
2600           Instruction *Member = Group->getMember(I);
2601 
2602           if (!Member)
2603             continue;
2604 
2605           Value *StridedVec = Builder.CreateExtractValue(DI, I);
2606           // If this member has different type, cast the result type.
2607           if (Member->getType() != ScalarTy) {
2608             VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2609             StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2610           }
2611 
2612           if (Group->isReverse())
2613             StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
2614 
2615           State.set(VPDefs[J], StridedVec, Part);
2616           ++J;
2617         }
2618       }
2619 
2620       return;
2621     }
2622 
2623     // For each member in the group, shuffle out the appropriate data from the
2624     // wide loads.
2625     unsigned J = 0;
2626     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2627       Instruction *Member = Group->getMember(I);
2628 
2629       // Skip the gaps in the group.
2630       if (!Member)
2631         continue;
2632 
2633       auto StrideMask =
2634           createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
2635       for (unsigned Part = 0; Part < UF; Part++) {
2636         Value *StridedVec = Builder.CreateShuffleVector(
2637             NewLoads[Part], StrideMask, "strided.vec");
2638 
2639         // If this member has different type, cast the result type.
2640         if (Member->getType() != ScalarTy) {
2641           assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2642           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2643           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2644         }
2645 
2646         if (Group->isReverse())
2647           StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
2648 
2649         State.set(VPDefs[J], StridedVec, Part);
2650       }
2651       ++J;
2652     }
2653     return;
2654   }
2655 
2656   // The sub vector type for current instruction.
2657   auto *SubVT = VectorType::get(ScalarTy, VF);
2658 
2659   // Vectorize the interleaved store group.
2660   Value *MaskForGaps =
2661       createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2662   assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) &&
2663          "masked interleaved groups are not allowed.");
2664   assert((!MaskForGaps || !VF.isScalable()) &&
2665          "masking gaps for scalable vectors is not yet supported.");
2666   for (unsigned Part = 0; Part < UF; Part++) {
2667     // Collect the stored vector from each member.
2668     SmallVector<Value *, 4> StoredVecs;
2669     unsigned StoredIdx = 0;
2670     for (unsigned i = 0; i < InterleaveFactor; i++) {
2671       assert((Group->getMember(i) || MaskForGaps) &&
2672              "Fail to get a member from an interleaved store group");
2673       Instruction *Member = Group->getMember(i);
2674 
2675       // Skip the gaps in the group.
2676       if (!Member) {
2677         Value *Undef = PoisonValue::get(SubVT);
2678         StoredVecs.push_back(Undef);
2679         continue;
2680       }
2681 
2682       Value *StoredVec = State.get(StoredValues[StoredIdx], Part);
2683       ++StoredIdx;
2684 
2685       if (Group->isReverse())
2686         StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse");
2687 
2688       // If this member has different type, cast it to a unified type.
2689 
2690       if (StoredVec->getType() != SubVT)
2691         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2692 
2693       StoredVecs.push_back(StoredVec);
2694     }
2695 
2696     // Interleave all the smaller vectors into one wider vector.
2697     Value *IVec = interleaveVectors(Builder, StoredVecs, "interleaved.vec");
2698     Instruction *NewStoreInstr;
2699     if (BlockInMask || MaskForGaps) {
2700       Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
2701       NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part],
2702                                                 Group->getAlign(), GroupMask);
2703     } else
2704       NewStoreInstr =
2705           Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2706 
2707     Group->addMetadata(NewStoreInstr);
2708   }
2709 }
2710 
2711 void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr,
2712                                                VPReplicateRecipe *RepRecipe,
2713                                                const VPIteration &Instance,
2714                                                VPTransformState &State) {
2715   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2716 
2717   // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
2718   // the first lane and part.
2719   if (isa<NoAliasScopeDeclInst>(Instr))
2720     if (!Instance.isFirstIteration())
2721       return;
2722 
2723   // Does this instruction return a value ?
2724   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2725 
2726   Instruction *Cloned = Instr->clone();
2727   if (!IsVoidRetTy) {
2728     Cloned->setName(Instr->getName() + ".cloned");
2729 #if !defined(NDEBUG)
2730     // Verify that VPlan type inference results agree with the type of the
2731     // generated values.
2732     assert(State.TypeAnalysis.inferScalarType(RepRecipe) == Cloned->getType() &&
2733            "inferred type and type from generated instructions do not match");
2734 #endif
2735   }
2736 
2737   RepRecipe->setFlags(Cloned);
2738 
2739   if (auto DL = Instr->getDebugLoc())
2740     State.setDebugLocFrom(DL);
2741 
2742   // Replace the operands of the cloned instructions with their scalar
2743   // equivalents in the new loop.
2744   for (const auto &I : enumerate(RepRecipe->operands())) {
2745     auto InputInstance = Instance;
2746     VPValue *Operand = I.value();
2747     if (vputils::isUniformAfterVectorization(Operand))
2748       InputInstance.Lane = VPLane::getFirstLane();
2749     Cloned->setOperand(I.index(), State.get(Operand, InputInstance));
2750   }
2751   State.addNewMetadata(Cloned, Instr);
2752 
2753   // Place the cloned scalar in the new loop.
2754   State.Builder.Insert(Cloned);
2755 
2756   State.set(RepRecipe, Cloned, Instance);
2757 
2758   // If we just cloned a new assumption, add it the assumption cache.
2759   if (auto *II = dyn_cast<AssumeInst>(Cloned))
2760     AC->registerAssumption(II);
2761 
2762   // End if-block.
2763   bool IfPredicateInstr = RepRecipe->getParent()->getParent()->isReplicator();
2764   if (IfPredicateInstr)
2765     PredicatedInstructions.push_back(Cloned);
2766 }
2767 
2768 Value *
2769 InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
2770   if (VectorTripCount)
2771     return VectorTripCount;
2772 
2773   Value *TC = getTripCount();
2774   IRBuilder<> Builder(InsertBlock->getTerminator());
2775 
2776   Type *Ty = TC->getType();
2777   // This is where we can make the step a runtime constant.
2778   Value *Step = createStepForVF(Builder, Ty, VF, UF);
2779 
2780   // If the tail is to be folded by masking, round the number of iterations N
2781   // up to a multiple of Step instead of rounding down. This is done by first
2782   // adding Step-1 and then rounding down. Note that it's ok if this addition
2783   // overflows: the vector induction variable will eventually wrap to zero given
2784   // that it starts at zero and its Step is a power of two; the loop will then
2785   // exit, with the last early-exit vector comparison also producing all-true.
2786   // For scalable vectors the VF is not guaranteed to be a power of 2, but this
2787   // is accounted for in emitIterationCountCheck that adds an overflow check.
2788   if (Cost->foldTailByMasking()) {
2789     assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
2790            "VF*UF must be a power of 2 when folding tail by masking");
2791     Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF);
2792     TC = Builder.CreateAdd(
2793         TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up");
2794   }
2795 
2796   // Now we need to generate the expression for the part of the loop that the
2797   // vectorized body will execute. This is equal to N - (N % Step) if scalar
2798   // iterations are not required for correctness, or N - Step, otherwise. Step
2799   // is equal to the vectorization factor (number of SIMD elements) times the
2800   // unroll factor (number of SIMD instructions).
2801   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2802 
2803   // There are cases where we *must* run at least one iteration in the remainder
2804   // loop.  See the cost model for when this can happen.  If the step evenly
2805   // divides the trip count, we set the remainder to be equal to the step. If
2806   // the step does not evenly divide the trip count, no adjustment is necessary
2807   // since there will already be scalar iterations. Note that the minimum
2808   // iterations check ensures that N >= Step.
2809   if (Cost->requiresScalarEpilogue(VF.isVector())) {
2810     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2811     R = Builder.CreateSelect(IsZero, Step, R);
2812   }
2813 
2814   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2815 
2816   return VectorTripCount;
2817 }
2818 
2819 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2820                                                    const DataLayout &DL) {
2821   // Verify that V is a vector type with same number of elements as DstVTy.
2822   auto *DstFVTy = cast<VectorType>(DstVTy);
2823   auto VF = DstFVTy->getElementCount();
2824   auto *SrcVecTy = cast<VectorType>(V->getType());
2825   assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match");
2826   Type *SrcElemTy = SrcVecTy->getElementType();
2827   Type *DstElemTy = DstFVTy->getElementType();
2828   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2829          "Vector elements must have same size");
2830 
2831   // Do a direct cast if element types are castable.
2832   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2833     return Builder.CreateBitOrPointerCast(V, DstFVTy);
2834   }
2835   // V cannot be directly casted to desired vector type.
2836   // May happen when V is a floating point vector but DstVTy is a vector of
2837   // pointers or vice-versa. Handle this using a two-step bitcast using an
2838   // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
2839   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2840          "Only one type should be a pointer type");
2841   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2842          "Only one type should be a floating point type");
2843   Type *IntTy =
2844       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2845   auto *VecIntTy = VectorType::get(IntTy, VF);
2846   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2847   return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
2848 }
2849 
2850 void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
2851   Value *Count = getTripCount();
2852   // Reuse existing vector loop preheader for TC checks.
2853   // Note that new preheader block is generated for vector loop.
2854   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2855   IRBuilder<> Builder(TCCheckBlock->getTerminator());
2856 
2857   // Generate code to check if the loop's trip count is less than VF * UF, or
2858   // equal to it in case a scalar epilogue is required; this implies that the
2859   // vector trip count is zero. This check also covers the case where adding one
2860   // to the backedge-taken count overflowed leading to an incorrect trip count
2861   // of zero. In this case we will also jump to the scalar loop.
2862   auto P = Cost->requiresScalarEpilogue(VF.isVector()) ? ICmpInst::ICMP_ULE
2863                                                        : ICmpInst::ICMP_ULT;
2864 
2865   // If tail is to be folded, vector loop takes care of all iterations.
2866   Type *CountTy = Count->getType();
2867   Value *CheckMinIters = Builder.getFalse();
2868   auto CreateStep = [&]() -> Value * {
2869     // Create step with max(MinProTripCount, UF * VF).
2870     if (UF * VF.getKnownMinValue() >= MinProfitableTripCount.getKnownMinValue())
2871       return createStepForVF(Builder, CountTy, VF, UF);
2872 
2873     Value *MinProfTC =
2874         createStepForVF(Builder, CountTy, MinProfitableTripCount, 1);
2875     if (!VF.isScalable())
2876       return MinProfTC;
2877     return Builder.CreateBinaryIntrinsic(
2878         Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF));
2879   };
2880 
2881   TailFoldingStyle Style = Cost->getTailFoldingStyle();
2882   if (Style == TailFoldingStyle::None)
2883     CheckMinIters =
2884         Builder.CreateICmp(P, Count, CreateStep(), "min.iters.check");
2885   else if (VF.isScalable() &&
2886            !isIndvarOverflowCheckKnownFalse(Cost, VF, UF) &&
2887            Style != TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) {
2888     // vscale is not necessarily a power-of-2, which means we cannot guarantee
2889     // an overflow to zero when updating induction variables and so an
2890     // additional overflow check is required before entering the vector loop.
2891 
2892     // Get the maximum unsigned value for the type.
2893     Value *MaxUIntTripCount =
2894         ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask());
2895     Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count);
2896 
2897     // Don't execute the vector loop if (UMax - n) < (VF * UF).
2898     CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep());
2899   }
2900 
2901   // Create new preheader for vector loop.
2902   LoopVectorPreHeader =
2903       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
2904                  "vector.ph");
2905 
2906   assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
2907                                DT->getNode(Bypass)->getIDom()) &&
2908          "TC check is expected to dominate Bypass");
2909 
2910   // Update dominator for Bypass & LoopExit (if needed).
2911   DT->changeImmediateDominator(Bypass, TCCheckBlock);
2912   if (!Cost->requiresScalarEpilogue(VF.isVector()))
2913     // If there is an epilogue which must run, there's no edge from the
2914     // middle block to exit blocks  and thus no need to update the immediate
2915     // dominator of the exit blocks.
2916     DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
2917 
2918   BranchInst &BI =
2919       *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
2920   if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
2921     setBranchWeights(BI, MinItersBypassWeights);
2922   ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
2923   LoopBypassBlocks.push_back(TCCheckBlock);
2924 }
2925 
2926 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {
2927   BasicBlock *const SCEVCheckBlock =
2928       RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock);
2929   if (!SCEVCheckBlock)
2930     return nullptr;
2931 
2932   assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
2933            (OptForSizeBasedOnProfile &&
2934             Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
2935          "Cannot SCEV check stride or overflow when optimizing for size");
2936 
2937 
2938   // Update dominator only if this is first RT check.
2939   if (LoopBypassBlocks.empty()) {
2940     DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
2941     if (!Cost->requiresScalarEpilogue(VF.isVector()))
2942       // If there is an epilogue which must run, there's no edge from the
2943       // middle block to exit blocks  and thus no need to update the immediate
2944       // dominator of the exit blocks.
2945       DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
2946   }
2947 
2948   LoopBypassBlocks.push_back(SCEVCheckBlock);
2949   AddedSafetyChecks = true;
2950   return SCEVCheckBlock;
2951 }
2952 
2953 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) {
2954   // VPlan-native path does not do any analysis for runtime checks currently.
2955   if (EnableVPlanNativePath)
2956     return nullptr;
2957 
2958   BasicBlock *const MemCheckBlock =
2959       RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader);
2960 
2961   // Check if we generated code that checks in runtime if arrays overlap. We put
2962   // the checks into a separate block to make the more common case of few
2963   // elements faster.
2964   if (!MemCheckBlock)
2965     return nullptr;
2966 
2967   if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
2968     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
2969            "Cannot emit memory checks when optimizing for size, unless forced "
2970            "to vectorize.");
2971     ORE->emit([&]() {
2972       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
2973                                         OrigLoop->getStartLoc(),
2974                                         OrigLoop->getHeader())
2975              << "Code-size may be reduced by not forcing "
2976                 "vectorization, or by source-code modifications "
2977                 "eliminating the need for runtime checks "
2978                 "(e.g., adding 'restrict').";
2979     });
2980   }
2981 
2982   LoopBypassBlocks.push_back(MemCheckBlock);
2983 
2984   AddedSafetyChecks = true;
2985 
2986   return MemCheckBlock;
2987 }
2988 
2989 void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
2990   LoopScalarBody = OrigLoop->getHeader();
2991   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
2992   assert(LoopVectorPreHeader && "Invalid loop structure");
2993   LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
2994   assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF.isVector())) &&
2995          "multiple exit loop without required epilogue?");
2996 
2997   LoopMiddleBlock =
2998       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
2999                  LI, nullptr, Twine(Prefix) + "middle.block");
3000   LoopScalarPreHeader =
3001       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3002                  nullptr, Twine(Prefix) + "scalar.ph");
3003 
3004   auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3005 
3006   // Set up the middle block terminator.  Two cases:
3007   // 1) If we know that we must execute the scalar epilogue, emit an
3008   //    unconditional branch.
3009   // 2) Otherwise, we must have a single unique exit block (due to how we
3010   //    implement the multiple exit case).  In this case, set up a conditional
3011   //    branch from the middle block to the loop scalar preheader, and the
3012   //    exit block.  completeLoopSkeleton will update the condition to use an
3013   //    iteration check, if required to decide whether to execute the remainder.
3014   BranchInst *BrInst =
3015       Cost->requiresScalarEpilogue(VF.isVector())
3016           ? BranchInst::Create(LoopScalarPreHeader)
3017           : BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
3018                                Builder.getTrue());
3019   BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3020   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3021 
3022   // Update dominator for loop exit. During skeleton creation, only the vector
3023   // pre-header and the middle block are created. The vector loop is entirely
3024   // created during VPlan exection.
3025   if (!Cost->requiresScalarEpilogue(VF.isVector()))
3026     // If there is an epilogue which must run, there's no edge from the
3027     // middle block to exit blocks  and thus no need to update the immediate
3028     // dominator of the exit blocks.
3029     DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3030 }
3031 
3032 PHINode *InnerLoopVectorizer::createInductionResumeValue(
3033     PHINode *OrigPhi, const InductionDescriptor &II, Value *Step,
3034     ArrayRef<BasicBlock *> BypassBlocks,
3035     std::pair<BasicBlock *, Value *> AdditionalBypass) {
3036   Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
3037   assert(VectorTripCount && "Expected valid arguments");
3038 
3039   Instruction *OldInduction = Legal->getPrimaryInduction();
3040   Value *&EndValue = IVEndValues[OrigPhi];
3041   Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
3042   if (OrigPhi == OldInduction) {
3043     // We know what the end value is.
3044     EndValue = VectorTripCount;
3045   } else {
3046     IRBuilder<> B(LoopVectorPreHeader->getTerminator());
3047 
3048     // Fast-math-flags propagate from the original induction instruction.
3049     if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3050       B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3051 
3052     EndValue = emitTransformedIndex(B, VectorTripCount, II.getStartValue(),
3053                                     Step, II.getKind(), II.getInductionBinOp());
3054     EndValue->setName("ind.end");
3055 
3056     // Compute the end value for the additional bypass (if applicable).
3057     if (AdditionalBypass.first) {
3058       B.SetInsertPoint(AdditionalBypass.first,
3059                        AdditionalBypass.first->getFirstInsertionPt());
3060       EndValueFromAdditionalBypass =
3061           emitTransformedIndex(B, AdditionalBypass.second, II.getStartValue(),
3062                                Step, II.getKind(), II.getInductionBinOp());
3063       EndValueFromAdditionalBypass->setName("ind.end");
3064     }
3065   }
3066 
3067   // Create phi nodes to merge from the  backedge-taken check block.
3068   PHINode *BCResumeVal = PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3069                                          LoopScalarPreHeader->getTerminator());
3070   // Copy original phi DL over to the new one.
3071   BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3072 
3073   // The new PHI merges the original incoming value, in case of a bypass,
3074   // or the value at the end of the vectorized loop.
3075   BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3076 
3077   // Fix the scalar body counter (PHI node).
3078   // The old induction's phi node in the scalar body needs the truncated
3079   // value.
3080   for (BasicBlock *BB : BypassBlocks)
3081     BCResumeVal->addIncoming(II.getStartValue(), BB);
3082 
3083   if (AdditionalBypass.first)
3084     BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
3085                                           EndValueFromAdditionalBypass);
3086   return BCResumeVal;
3087 }
3088 
3089 /// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
3090 /// expansion results.
3091 static Value *getExpandedStep(const InductionDescriptor &ID,
3092                               const SCEV2ValueTy &ExpandedSCEVs) {
3093   const SCEV *Step = ID.getStep();
3094   if (auto *C = dyn_cast<SCEVConstant>(Step))
3095     return C->getValue();
3096   if (auto *U = dyn_cast<SCEVUnknown>(Step))
3097     return U->getValue();
3098   auto I = ExpandedSCEVs.find(Step);
3099   assert(I != ExpandedSCEVs.end() && "SCEV must be expanded at this point");
3100   return I->second;
3101 }
3102 
3103 void InnerLoopVectorizer::createInductionResumeValues(
3104     const SCEV2ValueTy &ExpandedSCEVs,
3105     std::pair<BasicBlock *, Value *> AdditionalBypass) {
3106   assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3107           (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3108          "Inconsistent information about additional bypass.");
3109   // We are going to resume the execution of the scalar loop.
3110   // Go over all of the induction variables that we found and fix the
3111   // PHIs that are left in the scalar version of the loop.
3112   // The starting values of PHI nodes depend on the counter of the last
3113   // iteration in the vectorized loop.
3114   // If we come from a bypass edge then we need to start from the original
3115   // start value.
3116   for (const auto &InductionEntry : Legal->getInductionVars()) {
3117     PHINode *OrigPhi = InductionEntry.first;
3118     const InductionDescriptor &II = InductionEntry.second;
3119     PHINode *BCResumeVal = createInductionResumeValue(
3120         OrigPhi, II, getExpandedStep(II, ExpandedSCEVs), LoopBypassBlocks,
3121         AdditionalBypass);
3122     OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3123   }
3124 }
3125 
3126 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() {
3127   // The trip counts should be cached by now.
3128   Value *Count = getTripCount();
3129   Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
3130 
3131   auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3132 
3133   // Add a check in the middle block to see if we have completed
3134   // all of the iterations in the first vector loop.  Three cases:
3135   // 1) If we require a scalar epilogue, there is no conditional branch as
3136   //    we unconditionally branch to the scalar preheader.  Do nothing.
3137   // 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
3138   //    Thus if tail is to be folded, we know we don't need to run the
3139   //    remainder and we can use the previous value for the condition (true).
3140   // 3) Otherwise, construct a runtime check.
3141   if (!Cost->requiresScalarEpilogue(VF.isVector()) &&
3142       !Cost->foldTailByMasking()) {
3143     // Here we use the same DebugLoc as the scalar loop latch terminator instead
3144     // of the corresponding compare because they may have ended up with
3145     // different line numbers and we want to avoid awkward line stepping while
3146     // debugging. Eg. if the compare has got a line number inside the loop.
3147     // TODO: At the moment, CreateICmpEQ will simplify conditions with constant
3148     // operands. Perform simplification directly on VPlan once the branch is
3149     // modeled there.
3150     IRBuilder<> B(LoopMiddleBlock->getTerminator());
3151     B.SetCurrentDebugLocation(ScalarLatchTerm->getDebugLoc());
3152     Value *CmpN = B.CreateICmpEQ(Count, VectorTripCount, "cmp.n");
3153     BranchInst &BI = *cast<BranchInst>(LoopMiddleBlock->getTerminator());
3154     BI.setCondition(CmpN);
3155     if (hasBranchWeightMD(*ScalarLatchTerm)) {
3156       // Assume that `Count % VectorTripCount` is equally distributed.
3157       unsigned TripCount = UF * VF.getKnownMinValue();
3158       assert(TripCount > 0 && "trip count should not be zero");
3159       const uint32_t Weights[] = {1, TripCount - 1};
3160       setBranchWeights(BI, Weights);
3161     }
3162   }
3163 
3164 #ifdef EXPENSIVE_CHECKS
3165   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3166 #endif
3167 
3168   return LoopVectorPreHeader;
3169 }
3170 
3171 std::pair<BasicBlock *, Value *>
3172 InnerLoopVectorizer::createVectorizedLoopSkeleton(
3173     const SCEV2ValueTy &ExpandedSCEVs) {
3174   /*
3175    In this function we generate a new loop. The new loop will contain
3176    the vectorized instructions while the old loop will continue to run the
3177    scalar remainder.
3178 
3179        [ ] <-- old preheader - loop iteration number check and SCEVs in Plan's
3180      /  |      preheader are expanded here. Eventually all required SCEV
3181     /   |      expansion should happen here.
3182    /    v
3183   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
3184   |  /  |
3185   | /   v
3186   ||   [ ]     <-- vector pre header.
3187   |/    |
3188   |     v
3189   |    [  ] \
3190   |    [  ]_|   <-- vector loop (created during VPlan execution).
3191   |     |
3192   |     v
3193   \   -[ ]   <--- middle-block.
3194    \/   |
3195    /\   v
3196    | ->[ ]     <--- new preheader.
3197    |    |
3198  (opt)  v      <-- edge from middle to exit iff epilogue is not required.
3199    |   [ ] \
3200    |   [ ]_|   <-- old scalar loop to handle remainder (scalar epilogue).
3201     \   |
3202      \  v
3203       >[ ]     <-- exit block(s).
3204    ...
3205    */
3206 
3207   // Create an empty vector loop, and prepare basic blocks for the runtime
3208   // checks.
3209   createVectorLoopSkeleton("");
3210 
3211   // Now, compare the new count to zero. If it is zero skip the vector loop and
3212   // jump to the scalar loop. This check also covers the case where the
3213   // backedge-taken count is uint##_max: adding one to it will overflow leading
3214   // to an incorrect trip count of zero. In this (rare) case we will also jump
3215   // to the scalar loop.
3216   emitIterationCountCheck(LoopScalarPreHeader);
3217 
3218   // Generate the code to check any assumptions that we've made for SCEV
3219   // expressions.
3220   emitSCEVChecks(LoopScalarPreHeader);
3221 
3222   // Generate the code that checks in runtime if arrays overlap. We put the
3223   // checks into a separate block to make the more common case of few elements
3224   // faster.
3225   emitMemRuntimeChecks(LoopScalarPreHeader);
3226 
3227   // Emit phis for the new starting index of the scalar loop.
3228   createInductionResumeValues(ExpandedSCEVs);
3229 
3230   return {completeLoopSkeleton(), nullptr};
3231 }
3232 
3233 // Fix up external users of the induction variable. At this point, we are
3234 // in LCSSA form, with all external PHIs that use the IV having one input value,
3235 // coming from the remainder loop. We need those PHIs to also have a correct
3236 // value for the IV when arriving directly from the middle block.
3237 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3238                                        const InductionDescriptor &II,
3239                                        Value *VectorTripCount, Value *EndValue,
3240                                        BasicBlock *MiddleBlock,
3241                                        BasicBlock *VectorHeader, VPlan &Plan,
3242                                        VPTransformState &State) {
3243   // There are two kinds of external IV usages - those that use the value
3244   // computed in the last iteration (the PHI) and those that use the penultimate
3245   // value (the value that feeds into the phi from the loop latch).
3246   // We allow both, but they, obviously, have different values.
3247 
3248   assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
3249 
3250   DenseMap<Value *, Value *> MissingVals;
3251 
3252   // An external user of the last iteration's value should see the value that
3253   // the remainder loop uses to initialize its own IV.
3254   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3255   for (User *U : PostInc->users()) {
3256     Instruction *UI = cast<Instruction>(U);
3257     if (!OrigLoop->contains(UI)) {
3258       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3259       MissingVals[UI] = EndValue;
3260     }
3261   }
3262 
3263   // An external user of the penultimate value need to see EndValue - Step.
3264   // The simplest way to get this is to recompute it from the constituent SCEVs,
3265   // that is Start + (Step * (CRD - 1)).
3266   for (User *U : OrigPhi->users()) {
3267     auto *UI = cast<Instruction>(U);
3268     if (!OrigLoop->contains(UI)) {
3269       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3270       IRBuilder<> B(MiddleBlock->getTerminator());
3271 
3272       // Fast-math-flags propagate from the original induction instruction.
3273       if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3274         B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3275 
3276       Value *CountMinusOne = B.CreateSub(
3277           VectorTripCount, ConstantInt::get(VectorTripCount->getType(), 1));
3278       CountMinusOne->setName("cmo");
3279 
3280       VPValue *StepVPV = Plan.getSCEVExpansion(II.getStep());
3281       assert(StepVPV && "step must have been expanded during VPlan execution");
3282       Value *Step = StepVPV->isLiveIn() ? StepVPV->getLiveInIRValue()
3283                                         : State.get(StepVPV, {0, 0});
3284       Value *Escape =
3285           emitTransformedIndex(B, CountMinusOne, II.getStartValue(), Step,
3286                                II.getKind(), II.getInductionBinOp());
3287       Escape->setName("ind.escape");
3288       MissingVals[UI] = Escape;
3289     }
3290   }
3291 
3292   for (auto &I : MissingVals) {
3293     PHINode *PHI = cast<PHINode>(I.first);
3294     // One corner case we have to handle is two IVs "chasing" each-other,
3295     // that is %IV2 = phi [...], [ %IV1, %latch ]
3296     // In this case, if IV1 has an external use, we need to avoid adding both
3297     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3298     // don't already have an incoming value for the middle block.
3299     if (PHI->getBasicBlockIndex(MiddleBlock) == -1) {
3300       PHI->addIncoming(I.second, MiddleBlock);
3301       Plan.removeLiveOut(PHI);
3302     }
3303   }
3304 }
3305 
3306 namespace {
3307 
3308 struct CSEDenseMapInfo {
3309   static bool canHandle(const Instruction *I) {
3310     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3311            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3312   }
3313 
3314   static inline Instruction *getEmptyKey() {
3315     return DenseMapInfo<Instruction *>::getEmptyKey();
3316   }
3317 
3318   static inline Instruction *getTombstoneKey() {
3319     return DenseMapInfo<Instruction *>::getTombstoneKey();
3320   }
3321 
3322   static unsigned getHashValue(const Instruction *I) {
3323     assert(canHandle(I) && "Unknown instruction!");
3324     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3325                                                            I->value_op_end()));
3326   }
3327 
3328   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3329     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3330         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3331       return LHS == RHS;
3332     return LHS->isIdenticalTo(RHS);
3333   }
3334 };
3335 
3336 } // end anonymous namespace
3337 
3338 ///Perform cse of induction variable instructions.
3339 static void cse(BasicBlock *BB) {
3340   // Perform simple cse.
3341   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3342   for (Instruction &In : llvm::make_early_inc_range(*BB)) {
3343     if (!CSEDenseMapInfo::canHandle(&In))
3344       continue;
3345 
3346     // Check if we can replace this instruction with any of the
3347     // visited instructions.
3348     if (Instruction *V = CSEMap.lookup(&In)) {
3349       In.replaceAllUsesWith(V);
3350       In.eraseFromParent();
3351       continue;
3352     }
3353 
3354     CSEMap[&In] = &In;
3355   }
3356 }
3357 
3358 InstructionCost
3359 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
3360                                               ElementCount VF) const {
3361   // We only need to calculate a cost if the VF is scalar; for actual vectors
3362   // we should already have a pre-calculated cost at each VF.
3363   if (!VF.isScalar())
3364     return CallWideningDecisions.at(std::make_pair(CI, VF)).Cost;
3365 
3366   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
3367   Type *RetTy = CI->getType();
3368   if (RecurrenceDescriptor::isFMulAddIntrinsic(CI))
3369     if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind))
3370       return *RedCost;
3371 
3372   SmallVector<Type *, 4> Tys;
3373   for (auto &ArgOp : CI->args())
3374     Tys.push_back(ArgOp->getType());
3375 
3376   InstructionCost ScalarCallCost =
3377       TTI.getCallInstrCost(CI->getCalledFunction(), RetTy, Tys, CostKind);
3378 
3379   // If this is an intrinsic we may have a lower cost for it.
3380   if (getVectorIntrinsicIDForCall(CI, TLI)) {
3381     InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
3382     return std::min(ScalarCallCost, IntrinsicCost);
3383   }
3384   return ScalarCallCost;
3385 }
3386 
3387 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) {
3388   if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
3389     return Elt;
3390   return VectorType::get(Elt, VF);
3391 }
3392 
3393 InstructionCost
3394 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3395                                                    ElementCount VF) const {
3396   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3397   assert(ID && "Expected intrinsic call!");
3398   Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
3399   FastMathFlags FMF;
3400   if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3401     FMF = FPMO->getFastMathFlags();
3402 
3403   SmallVector<const Value *> Arguments(CI->args());
3404   FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
3405   SmallVector<Type *> ParamTys;
3406   std::transform(FTy->param_begin(), FTy->param_end(),
3407                  std::back_inserter(ParamTys),
3408                  [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });
3409 
3410   IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
3411                                     dyn_cast<IntrinsicInst>(CI));
3412   return TTI.getIntrinsicInstrCost(CostAttrs,
3413                                    TargetTransformInfo::TCK_RecipThroughput);
3414 }
3415 
3416 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3417   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3418   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3419   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3420 }
3421 
3422 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3423   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3424   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3425   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3426 }
3427 
3428 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
3429                                             VPlan &Plan) {
3430   // Fix widened non-induction PHIs by setting up the PHI operands.
3431   if (EnableVPlanNativePath)
3432     fixNonInductionPHIs(Plan, State);
3433 
3434   // At this point every instruction in the original loop is widened to a
3435   // vector form. Now we need to fix the recurrences in the loop. These PHI
3436   // nodes are currently empty because we did not want to introduce cycles.
3437   // This is the second stage of vectorizing recurrences.
3438   fixCrossIterationPHIs(State);
3439 
3440   // Forget the original basic block.
3441   PSE.getSE()->forgetLoop(OrigLoop);
3442   PSE.getSE()->forgetBlockAndLoopDispositions();
3443 
3444   // After vectorization, the exit blocks of the original loop will have
3445   // additional predecessors. Invalidate SCEVs for the exit phis in case SE
3446   // looked through single-entry phis.
3447   SmallVector<BasicBlock *> ExitBlocks;
3448   OrigLoop->getExitBlocks(ExitBlocks);
3449   for (BasicBlock *Exit : ExitBlocks)
3450     for (PHINode &PN : Exit->phis())
3451       PSE.getSE()->forgetLcssaPhiWithNewPredecessor(OrigLoop, &PN);
3452 
3453   VPBasicBlock *LatchVPBB = Plan.getVectorLoopRegion()->getExitingBasicBlock();
3454   Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]);
3455   if (Cost->requiresScalarEpilogue(VF.isVector())) {
3456     // No edge from the middle block to the unique exit block has been inserted
3457     // and there is nothing to fix from vector loop; phis should have incoming
3458     // from scalar loop only.
3459   } else {
3460     // TODO: Check VPLiveOuts to see if IV users need fixing instead of checking
3461     // the cost model.
3462 
3463     // If we inserted an edge from the middle block to the unique exit block,
3464     // update uses outside the loop (phis) to account for the newly inserted
3465     // edge.
3466 
3467     // Fix-up external users of the induction variables.
3468     for (const auto &Entry : Legal->getInductionVars())
3469       fixupIVUsers(Entry.first, Entry.second,
3470                    getOrCreateVectorTripCount(VectorLoop->getLoopPreheader()),
3471                    IVEndValues[Entry.first], LoopMiddleBlock,
3472                    VectorLoop->getHeader(), Plan, State);
3473   }
3474 
3475   // Fix LCSSA phis not already fixed earlier. Extracts may need to be generated
3476   // in the exit block, so update the builder.
3477   State.Builder.SetInsertPoint(State.CFG.ExitBB,
3478                                State.CFG.ExitBB->getFirstNonPHIIt());
3479   for (const auto &KV : Plan.getLiveOuts())
3480     KV.second->fixPhi(Plan, State);
3481 
3482   for (Instruction *PI : PredicatedInstructions)
3483     sinkScalarOperands(&*PI);
3484 
3485   // Remove redundant induction instructions.
3486   cse(VectorLoop->getHeader());
3487 
3488   // Set/update profile weights for the vector and remainder loops as original
3489   // loop iterations are now distributed among them. Note that original loop
3490   // represented by LoopScalarBody becomes remainder loop after vectorization.
3491   //
3492   // For cases like foldTailByMasking() and requiresScalarEpiloque() we may
3493   // end up getting slightly roughened result but that should be OK since
3494   // profile is not inherently precise anyway. Note also possible bypass of
3495   // vector code caused by legality checks is ignored, assigning all the weight
3496   // to the vector loop, optimistically.
3497   //
3498   // For scalable vectorization we can't know at compile time how many iterations
3499   // of the loop are handled in one vector iteration, so instead assume a pessimistic
3500   // vscale of '1'.
3501   setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody), VectorLoop,
3502                                LI->getLoopFor(LoopScalarBody),
3503                                VF.getKnownMinValue() * UF);
3504 }
3505 
3506 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
3507   // In order to support recurrences we need to be able to vectorize Phi nodes.
3508   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3509   // stage #2: We now need to fix the recurrences by adding incoming edges to
3510   // the currently empty PHI nodes. At this point every instruction in the
3511   // original loop is widened to a vector form so we can use them to construct
3512   // the incoming edges.
3513   VPBasicBlock *Header =
3514       State.Plan->getVectorLoopRegion()->getEntryBasicBlock();
3515 
3516   for (VPRecipeBase &R : Header->phis()) {
3517     if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
3518       fixReduction(ReductionPhi, State);
3519   }
3520 
3521   for (VPRecipeBase &R : Header->phis()) {
3522     if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
3523       fixFixedOrderRecurrence(FOR, State);
3524   }
3525 }
3526 
3527 void InnerLoopVectorizer::fixFixedOrderRecurrence(
3528     VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) {
3529   // This is the second phase of vectorizing first-order recurrences. An
3530   // overview of the transformation is described below. Suppose we have the
3531   // following loop.
3532   //
3533   //   for (int i = 0; i < n; ++i)
3534   //     b[i] = a[i] - a[i - 1];
3535   //
3536   // There is a first-order recurrence on "a". For this loop, the shorthand
3537   // scalar IR looks like:
3538   //
3539   //   scalar.ph:
3540   //     s_init = a[-1]
3541   //     br scalar.body
3542   //
3543   //   scalar.body:
3544   //     i = phi [0, scalar.ph], [i+1, scalar.body]
3545   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3546   //     s2 = a[i]
3547   //     b[i] = s2 - s1
3548   //     br cond, scalar.body, ...
3549   //
3550   // In this example, s1 is a recurrence because it's value depends on the
3551   // previous iteration. In the first phase of vectorization, we created a
3552   // vector phi v1 for s1. We now complete the vectorization and produce the
3553   // shorthand vector IR shown below (for VF = 4, UF = 1).
3554   //
3555   //   vector.ph:
3556   //     v_init = vector(..., ..., ..., a[-1])
3557   //     br vector.body
3558   //
3559   //   vector.body
3560   //     i = phi [0, vector.ph], [i+4, vector.body]
3561   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
3562   //     v2 = a[i, i+1, i+2, i+3];
3563   //     v3 = vector(v1(3), v2(0, 1, 2))
3564   //     b[i, i+1, i+2, i+3] = v2 - v3
3565   //     br cond, vector.body, middle.block
3566   //
3567   //   middle.block:
3568   //     x = v2(3)
3569   //     br scalar.ph
3570   //
3571   //   scalar.ph:
3572   //     s_init = phi [x, middle.block], [a[-1], otherwise]
3573   //     br scalar.body
3574   //
3575   // After execution completes the vector loop, we extract the next value of
3576   // the recurrence (x) to use as the initial value in the scalar loop.
3577 
3578   // Extract the last vector element in the middle block. This will be the
3579   // initial value for the recurrence when jumping to the scalar loop.
3580   VPValue *PreviousDef = PhiR->getBackedgeValue();
3581   Value *Incoming = State.get(PreviousDef, UF - 1);
3582   auto *ExtractForScalar = Incoming;
3583   auto *IdxTy = Builder.getInt32Ty();
3584   Value *RuntimeVF = nullptr;
3585   if (VF.isVector()) {
3586     auto *One = ConstantInt::get(IdxTy, 1);
3587     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3588     RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
3589     auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
3590     ExtractForScalar =
3591         Builder.CreateExtractElement(Incoming, LastIdx, "vector.recur.extract");
3592   }
3593 
3594   auto RecurSplice = cast<VPInstruction>(*PhiR->user_begin());
3595   assert(PhiR->getNumUsers() == 1 &&
3596          RecurSplice->getOpcode() ==
3597              VPInstruction::FirstOrderRecurrenceSplice &&
3598          "recurrence phi must have a single user: FirstOrderRecurrenceSplice");
3599   SmallVector<VPLiveOut *> LiveOuts;
3600   for (VPUser *U : RecurSplice->users())
3601     if (auto *LiveOut = dyn_cast<VPLiveOut>(U))
3602       LiveOuts.push_back(LiveOut);
3603 
3604   if (!LiveOuts.empty()) {
3605     // Extract the second last element in the middle block if the
3606     // Phi is used outside the loop. We need to extract the phi itself
3607     // and not the last element (the phi update in the current iteration). This
3608     // will be the value when jumping to the exit block from the
3609     // LoopMiddleBlock, when the scalar loop is not run at all.
3610     Value *ExtractForPhiUsedOutsideLoop = nullptr;
3611     if (VF.isVector()) {
3612       auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
3613       ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3614           Incoming, Idx, "vector.recur.extract.for.phi");
3615     } else {
3616       assert(UF > 1 && "VF and UF cannot both be 1");
3617       // When loop is unrolled without vectorizing, initialize
3618       // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled
3619       // value of `Incoming`. This is analogous to the vectorized case above:
3620       // extracting the second last element when VF > 1.
3621       ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
3622     }
3623 
3624     for (VPLiveOut *LiveOut : LiveOuts) {
3625       assert(!Cost->requiresScalarEpilogue(VF.isVector()));
3626       PHINode *LCSSAPhi = LiveOut->getPhi();
3627       LCSSAPhi->addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3628       State.Plan->removeLiveOut(LCSSAPhi);
3629     }
3630   }
3631 
3632   // Fix the initial value of the original recurrence in the scalar loop.
3633   Builder.SetInsertPoint(LoopScalarPreHeader, LoopScalarPreHeader->begin());
3634   PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
3635   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3636   auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue();
3637   for (auto *BB : predecessors(LoopScalarPreHeader)) {
3638     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3639     Start->addIncoming(Incoming, BB);
3640   }
3641 
3642   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3643   Phi->setName("scalar.recur");
3644 }
3645 
3646 void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
3647                                        VPTransformState &State) {
3648   PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
3649   // Get it's reduction variable descriptor.
3650   assert(Legal->isReductionVariable(OrigPhi) &&
3651          "Unable to find the reduction variable");
3652   const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
3653 
3654   RecurKind RK = RdxDesc.getRecurrenceKind();
3655   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3656   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3657   if (auto *I = dyn_cast<Instruction>(&*ReductionStartValue))
3658     State.setDebugLocFrom(I->getDebugLoc());
3659 
3660   VPValue *LoopExitInstDef = PhiR->getBackedgeValue();
3661 
3662   // Before each round, move the insertion point right between
3663   // the PHIs and the values we are going to write.
3664   // This allows us to write both PHINodes and the extractelement
3665   // instructions.
3666   Builder.SetInsertPoint(LoopMiddleBlock,
3667                          LoopMiddleBlock->getFirstInsertionPt());
3668 
3669   State.setDebugLocFrom(LoopExitInst->getDebugLoc());
3670 
3671   Type *PhiTy = OrigPhi->getType();
3672   // If tail is folded by masking, the vector value to leave the loop should be
3673   // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
3674   // instead of the former. For an inloop reduction the reduction will already
3675   // be predicated, and does not need to be handled here.
3676   if (Cost->foldTailByMasking() && !PhiR->isInLoop()) {
3677     VPValue *Def = nullptr;
3678     for (VPUser *U : LoopExitInstDef->users()) {
3679       auto *S = dyn_cast<VPInstruction>(U);
3680       if (S && S->getOpcode() == Instruction::Select) {
3681         Def = S;
3682         break;
3683       }
3684     }
3685     if (Def)
3686       LoopExitInstDef = Def;
3687   }
3688 
3689   VectorParts RdxParts(UF);
3690   for (unsigned Part = 0; Part < UF; ++Part)
3691     RdxParts[Part] = State.get(LoopExitInstDef, Part);
3692 
3693   // If the vector reduction can be performed in a smaller type, we truncate
3694   // then extend the loop exit value to enable InstCombine to evaluate the
3695   // entire expression in the smaller type.
3696   if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
3697     Builder.SetInsertPoint(LoopMiddleBlock,
3698                            LoopMiddleBlock->getFirstInsertionPt());
3699     Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
3700     for (unsigned Part = 0; Part < UF; ++Part) {
3701       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3702     }
3703   }
3704 
3705   // Reduce all of the unrolled parts into a single vector.
3706   Value *ReducedPartRdx = RdxParts[0];
3707   unsigned Op = RecurrenceDescriptor::getOpcode(RK);
3708 
3709   // The middle block terminator has already been assigned a DebugLoc here (the
3710   // OrigLoop's single latch terminator). We want the whole middle block to
3711   // appear to execute on this line because: (a) it is all compiler generated,
3712   // (b) these instructions are always executed after evaluating the latch
3713   // conditional branch, and (c) other passes may add new predecessors which
3714   // terminate on this line. This is the easiest way to ensure we don't
3715   // accidentally cause an extra step back into the loop while debugging.
3716   State.setDebugLocFrom(LoopMiddleBlock->getTerminator()->getDebugLoc());
3717   if (PhiR->isOrdered())
3718     ReducedPartRdx = RdxParts[UF - 1];
3719   else {
3720     // Floating-point operations should have some FMF to enable the reduction.
3721     IRBuilderBase::FastMathFlagGuard FMFG(Builder);
3722     Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
3723     for (unsigned Part = 1; Part < UF; ++Part) {
3724       Value *RdxPart = RdxParts[Part];
3725       if (Op != Instruction::ICmp && Op != Instruction::FCmp)
3726         ReducedPartRdx = Builder.CreateBinOp(
3727             (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
3728       else if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK))
3729         ReducedPartRdx = createAnyOfOp(Builder, ReductionStartValue, RK,
3730                                        ReducedPartRdx, RdxPart);
3731       else
3732         ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
3733     }
3734   }
3735 
3736   // Create the reduction after the loop. Note that inloop reductions create the
3737   // target reduction in the loop using a Reduction recipe.
3738   if (VF.isVector() && !PhiR->isInLoop()) {
3739     ReducedPartRdx =
3740         createTargetReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi);
3741     // If the reduction can be performed in a smaller type, we need to extend
3742     // the reduction to the wider type before we branch to the original loop.
3743     if (PhiTy != RdxDesc.getRecurrenceType())
3744       ReducedPartRdx = RdxDesc.isSigned()
3745                            ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
3746                            : Builder.CreateZExt(ReducedPartRdx, PhiTy);
3747   }
3748 
3749   PHINode *ResumePhi =
3750       dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
3751 
3752   // Create a phi node that merges control-flow from the backedge-taken check
3753   // block and the middle block.
3754   PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx",
3755                                         LoopScalarPreHeader->getTerminator());
3756 
3757   // If we are fixing reductions in the epilogue loop then we should already
3758   // have created a bc.merge.rdx Phi after the main vector body. Ensure that
3759   // we carry over the incoming values correctly.
3760   for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
3761     if (Incoming == LoopMiddleBlock)
3762       BCBlockPhi->addIncoming(ReducedPartRdx, Incoming);
3763     else if (ResumePhi && llvm::is_contained(ResumePhi->blocks(), Incoming))
3764       BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
3765                               Incoming);
3766     else
3767       BCBlockPhi->addIncoming(ReductionStartValue, Incoming);
3768   }
3769 
3770   // Set the resume value for this reduction
3771   ReductionResumeValues.insert({&RdxDesc, BCBlockPhi});
3772 
3773   // If there were stores of the reduction value to a uniform memory address
3774   // inside the loop, create the final store here.
3775   if (StoreInst *SI = RdxDesc.IntermediateStore) {
3776     StoreInst *NewSI =
3777         Builder.CreateAlignedStore(ReducedPartRdx, SI->getPointerOperand(),
3778                                    SI->getAlign());
3779     propagateMetadata(NewSI, SI);
3780 
3781     // If the reduction value is used in other places,
3782     // then let the code below create PHI's for that.
3783   }
3784 
3785   // Now, we need to fix the users of the reduction variable
3786   // inside and outside of the scalar remainder loop.
3787 
3788   // We know that the loop is in LCSSA form. We need to update the PHI nodes
3789   // in the exit blocks.  See comment on analogous loop in
3790   // fixFixedOrderRecurrence for a more complete explaination of the logic.
3791   if (!Cost->requiresScalarEpilogue(VF.isVector()))
3792     for (PHINode &LCSSAPhi : LoopExitBlock->phis())
3793       if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) {
3794         LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
3795         State.Plan->removeLiveOut(&LCSSAPhi);
3796       }
3797 
3798   // Fix the scalar loop reduction variable with the incoming reduction sum
3799   // from the vector body and from the backedge value.
3800   int IncomingEdgeBlockIdx =
3801       OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
3802   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
3803   // Pick the other block.
3804   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
3805   OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
3806   OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
3807 }
3808 
3809 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
3810   // The basic block and loop containing the predicated instruction.
3811   auto *PredBB = PredInst->getParent();
3812   auto *VectorLoop = LI->getLoopFor(PredBB);
3813 
3814   // Initialize a worklist with the operands of the predicated instruction.
3815   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3816 
3817   // Holds instructions that we need to analyze again. An instruction may be
3818   // reanalyzed if we don't yet know if we can sink it or not.
3819   SmallVector<Instruction *, 8> InstsToReanalyze;
3820 
3821   // Returns true if a given use occurs in the predicated block. Phi nodes use
3822   // their operands in their corresponding predecessor blocks.
3823   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
3824     auto *I = cast<Instruction>(U.getUser());
3825     BasicBlock *BB = I->getParent();
3826     if (auto *Phi = dyn_cast<PHINode>(I))
3827       BB = Phi->getIncomingBlock(
3828           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3829     return BB == PredBB;
3830   };
3831 
3832   // Iteratively sink the scalarized operands of the predicated instruction
3833   // into the block we created for it. When an instruction is sunk, it's
3834   // operands are then added to the worklist. The algorithm ends after one pass
3835   // through the worklist doesn't sink a single instruction.
3836   bool Changed;
3837   do {
3838     // Add the instructions that need to be reanalyzed to the worklist, and
3839     // reset the changed indicator.
3840     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
3841     InstsToReanalyze.clear();
3842     Changed = false;
3843 
3844     while (!Worklist.empty()) {
3845       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
3846 
3847       // We can't sink an instruction if it is a phi node, is not in the loop,
3848       // may have side effects or may read from memory.
3849       // TODO Could dor more granular checking to allow sinking a load past non-store instructions.
3850       if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
3851           I->mayHaveSideEffects() || I->mayReadFromMemory())
3852           continue;
3853 
3854       // If the instruction is already in PredBB, check if we can sink its
3855       // operands. In that case, VPlan's sinkScalarOperands() succeeded in
3856       // sinking the scalar instruction I, hence it appears in PredBB; but it
3857       // may have failed to sink I's operands (recursively), which we try
3858       // (again) here.
3859       if (I->getParent() == PredBB) {
3860         Worklist.insert(I->op_begin(), I->op_end());
3861         continue;
3862       }
3863 
3864       // It's legal to sink the instruction if all its uses occur in the
3865       // predicated block. Otherwise, there's nothing to do yet, and we may
3866       // need to reanalyze the instruction.
3867       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
3868         InstsToReanalyze.push_back(I);
3869         continue;
3870       }
3871 
3872       // Move the instruction to the beginning of the predicated block, and add
3873       // it's operands to the worklist.
3874       I->moveBefore(&*PredBB->getFirstInsertionPt());
3875       Worklist.insert(I->op_begin(), I->op_end());
3876 
3877       // The sinking may have enabled other instructions to be sunk, so we will
3878       // need to iterate.
3879       Changed = true;
3880     }
3881   } while (Changed);
3882 }
3883 
3884 void InnerLoopVectorizer::fixNonInductionPHIs(VPlan &Plan,
3885                                               VPTransformState &State) {
3886   auto Iter = vp_depth_first_deep(Plan.getEntry());
3887   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
3888     for (VPRecipeBase &P : VPBB->phis()) {
3889       VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P);
3890       if (!VPPhi)
3891         continue;
3892       PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
3893       // Make sure the builder has a valid insert point.
3894       Builder.SetInsertPoint(NewPhi);
3895       for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
3896         VPValue *Inc = VPPhi->getIncomingValue(i);
3897         VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
3898         NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
3899       }
3900     }
3901   }
3902 }
3903 
3904 bool InnerLoopVectorizer::useOrderedReductions(
3905     const RecurrenceDescriptor &RdxDesc) {
3906   return Cost->useOrderedReductions(RdxDesc);
3907 }
3908 
3909 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
3910   // We should not collect Scalars more than once per VF. Right now, this
3911   // function is called from collectUniformsAndScalars(), which already does
3912   // this check. Collecting Scalars for VF=1 does not make any sense.
3913   assert(VF.isVector() && !Scalars.contains(VF) &&
3914          "This function should not be visited twice for the same VF");
3915 
3916   // This avoids any chances of creating a REPLICATE recipe during planning
3917   // since that would result in generation of scalarized code during execution,
3918   // which is not supported for scalable vectors.
3919   if (VF.isScalable()) {
3920     Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end());
3921     return;
3922   }
3923 
3924   SmallSetVector<Instruction *, 8> Worklist;
3925 
3926   // These sets are used to seed the analysis with pointers used by memory
3927   // accesses that will remain scalar.
3928   SmallSetVector<Instruction *, 8> ScalarPtrs;
3929   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
3930   auto *Latch = TheLoop->getLoopLatch();
3931 
3932   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
3933   // The pointer operands of loads and stores will be scalar as long as the
3934   // memory access is not a gather or scatter operation. The value operand of a
3935   // store will remain scalar if the store is scalarized.
3936   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
3937     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
3938     assert(WideningDecision != CM_Unknown &&
3939            "Widening decision should be ready at this moment");
3940     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
3941       if (Ptr == Store->getValueOperand())
3942         return WideningDecision == CM_Scalarize;
3943     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
3944            "Ptr is neither a value or pointer operand");
3945     return WideningDecision != CM_GatherScatter;
3946   };
3947 
3948   // A helper that returns true if the given value is a bitcast or
3949   // getelementptr instruction contained in the loop.
3950   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
3951     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
3952             isa<GetElementPtrInst>(V)) &&
3953            !TheLoop->isLoopInvariant(V);
3954   };
3955 
3956   // A helper that evaluates a memory access's use of a pointer. If the use will
3957   // be a scalar use and the pointer is only used by memory accesses, we place
3958   // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
3959   // PossibleNonScalarPtrs.
3960   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
3961     // We only care about bitcast and getelementptr instructions contained in
3962     // the loop.
3963     if (!isLoopVaryingBitCastOrGEP(Ptr))
3964       return;
3965 
3966     // If the pointer has already been identified as scalar (e.g., if it was
3967     // also identified as uniform), there's nothing to do.
3968     auto *I = cast<Instruction>(Ptr);
3969     if (Worklist.count(I))
3970       return;
3971 
3972     // If the use of the pointer will be a scalar use, and all users of the
3973     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
3974     // place the pointer in PossibleNonScalarPtrs.
3975     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
3976           return isa<LoadInst>(U) || isa<StoreInst>(U);
3977         }))
3978       ScalarPtrs.insert(I);
3979     else
3980       PossibleNonScalarPtrs.insert(I);
3981   };
3982 
3983   // We seed the scalars analysis with three classes of instructions: (1)
3984   // instructions marked uniform-after-vectorization and (2) bitcast,
3985   // getelementptr and (pointer) phi instructions used by memory accesses
3986   // requiring a scalar use.
3987   //
3988   // (1) Add to the worklist all instructions that have been identified as
3989   // uniform-after-vectorization.
3990   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
3991 
3992   // (2) Add to the worklist all bitcast and getelementptr instructions used by
3993   // memory accesses requiring a scalar use. The pointer operands of loads and
3994   // stores will be scalar as long as the memory accesses is not a gather or
3995   // scatter operation. The value operand of a store will remain scalar if the
3996   // store is scalarized.
3997   for (auto *BB : TheLoop->blocks())
3998     for (auto &I : *BB) {
3999       if (auto *Load = dyn_cast<LoadInst>(&I)) {
4000         evaluatePtrUse(Load, Load->getPointerOperand());
4001       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4002         evaluatePtrUse(Store, Store->getPointerOperand());
4003         evaluatePtrUse(Store, Store->getValueOperand());
4004       }
4005     }
4006   for (auto *I : ScalarPtrs)
4007     if (!PossibleNonScalarPtrs.count(I)) {
4008       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4009       Worklist.insert(I);
4010     }
4011 
4012   // Insert the forced scalars.
4013   // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector
4014   // induction variable when the PHI user is scalarized.
4015   auto ForcedScalar = ForcedScalars.find(VF);
4016   if (ForcedScalar != ForcedScalars.end())
4017     for (auto *I : ForcedScalar->second) {
4018       LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n");
4019       Worklist.insert(I);
4020     }
4021 
4022   // Expand the worklist by looking through any bitcasts and getelementptr
4023   // instructions we've already identified as scalar. This is similar to the
4024   // expansion step in collectLoopUniforms(); however, here we're only
4025   // expanding to include additional bitcasts and getelementptr instructions.
4026   unsigned Idx = 0;
4027   while (Idx != Worklist.size()) {
4028     Instruction *Dst = Worklist[Idx++];
4029     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4030       continue;
4031     auto *Src = cast<Instruction>(Dst->getOperand(0));
4032     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4033           auto *J = cast<Instruction>(U);
4034           return !TheLoop->contains(J) || Worklist.count(J) ||
4035                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4036                   isScalarUse(J, Src));
4037         })) {
4038       Worklist.insert(Src);
4039       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4040     }
4041   }
4042 
4043   // An induction variable will remain scalar if all users of the induction
4044   // variable and induction variable update remain scalar.
4045   for (const auto &Induction : Legal->getInductionVars()) {
4046     auto *Ind = Induction.first;
4047     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4048 
4049     // If tail-folding is applied, the primary induction variable will be used
4050     // to feed a vector compare.
4051     if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
4052       continue;
4053 
4054     // Returns true if \p Indvar is a pointer induction that is used directly by
4055     // load/store instruction \p I.
4056     auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
4057                                               Instruction *I) {
4058       return Induction.second.getKind() ==
4059                  InductionDescriptor::IK_PtrInduction &&
4060              (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
4061              Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar);
4062     };
4063 
4064     // Determine if all users of the induction variable are scalar after
4065     // vectorization.
4066     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4067       auto *I = cast<Instruction>(U);
4068       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4069              IsDirectLoadStoreFromPtrIndvar(Ind, I);
4070     });
4071     if (!ScalarInd)
4072       continue;
4073 
4074     // Determine if all users of the induction variable update instruction are
4075     // scalar after vectorization.
4076     auto ScalarIndUpdate =
4077         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4078           auto *I = cast<Instruction>(U);
4079           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4080                  IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
4081         });
4082     if (!ScalarIndUpdate)
4083       continue;
4084 
4085     // The induction variable and its update instruction will remain scalar.
4086     Worklist.insert(Ind);
4087     Worklist.insert(IndUpdate);
4088     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4089     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4090                       << "\n");
4091   }
4092 
4093   Scalars[VF].insert(Worklist.begin(), Worklist.end());
4094 }
4095 
4096 bool LoopVectorizationCostModel::isScalarWithPredication(
4097     Instruction *I, ElementCount VF) const {
4098   if (!isPredicatedInst(I))
4099     return false;
4100 
4101   // Do we have a non-scalar lowering for this predicated
4102   // instruction? No - it is scalar with predication.
4103   switch(I->getOpcode()) {
4104   default:
4105     return true;
4106   case Instruction::Call:
4107     if (VF.isScalar())
4108       return true;
4109     return CallWideningDecisions.at(std::make_pair(cast<CallInst>(I), VF))
4110                .Kind == CM_Scalarize;
4111   case Instruction::Load:
4112   case Instruction::Store: {
4113     auto *Ptr = getLoadStorePointerOperand(I);
4114     auto *Ty = getLoadStoreType(I);
4115     Type *VTy = Ty;
4116     if (VF.isVector())
4117       VTy = VectorType::get(Ty, VF);
4118     const Align Alignment = getLoadStoreAlignment(I);
4119     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
4120                                 TTI.isLegalMaskedGather(VTy, Alignment))
4121                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
4122                                 TTI.isLegalMaskedScatter(VTy, Alignment));
4123   }
4124   case Instruction::UDiv:
4125   case Instruction::SDiv:
4126   case Instruction::SRem:
4127   case Instruction::URem: {
4128     // We have the option to use the safe-divisor idiom to avoid predication.
4129     // The cost based decision here will always select safe-divisor for
4130     // scalable vectors as scalarization isn't legal.
4131     const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
4132     return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost);
4133   }
4134   }
4135 }
4136 
4137 bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
4138   if (!blockNeedsPredicationForAnyReason(I->getParent()))
4139     return false;
4140 
4141   // Can we prove this instruction is safe to unconditionally execute?
4142   // If not, we must use some form of predication.
4143   switch(I->getOpcode()) {
4144   default:
4145     return false;
4146   case Instruction::Load:
4147   case Instruction::Store: {
4148     if (!Legal->isMaskRequired(I))
4149       return false;
4150     // When we know the load's address is loop invariant and the instruction
4151     // in the original scalar loop was unconditionally executed then we
4152     // don't need to mark it as a predicated instruction. Tail folding may
4153     // introduce additional predication, but we're guaranteed to always have
4154     // at least one active lane.  We call Legal->blockNeedsPredication here
4155     // because it doesn't query tail-folding.  For stores, we need to prove
4156     // both speculation safety (which follows from the same argument as loads),
4157     // but also must prove the value being stored is correct.  The easiest
4158     // form of the later is to require that all values stored are the same.
4159     if (Legal->isInvariant(getLoadStorePointerOperand(I)) &&
4160         (isa<LoadInst>(I) ||
4161          (isa<StoreInst>(I) &&
4162           TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()))) &&
4163         !Legal->blockNeedsPredication(I->getParent()))
4164       return false;
4165     return true;
4166   }
4167   case Instruction::UDiv:
4168   case Instruction::SDiv:
4169   case Instruction::SRem:
4170   case Instruction::URem:
4171     // TODO: We can use the loop-preheader as context point here and get
4172     // context sensitive reasoning
4173     return !isSafeToSpeculativelyExecute(I);
4174   case Instruction::Call:
4175     return Legal->isMaskRequired(I);
4176   }
4177 }
4178 
4179 std::pair<InstructionCost, InstructionCost>
4180 LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
4181                                                     ElementCount VF) const {
4182   assert(I->getOpcode() == Instruction::UDiv ||
4183          I->getOpcode() == Instruction::SDiv ||
4184          I->getOpcode() == Instruction::SRem ||
4185          I->getOpcode() == Instruction::URem);
4186   assert(!isSafeToSpeculativelyExecute(I));
4187 
4188   const TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
4189 
4190   // Scalarization isn't legal for scalable vector types
4191   InstructionCost ScalarizationCost = InstructionCost::getInvalid();
4192   if (!VF.isScalable()) {
4193     // Get the scalarization cost and scale this amount by the probability of
4194     // executing the predicated block. If the instruction is not predicated,
4195     // we fall through to the next case.
4196     ScalarizationCost = 0;
4197 
4198     // These instructions have a non-void type, so account for the phi nodes
4199     // that we will create. This cost is likely to be zero. The phi node
4200     // cost, if any, should be scaled by the block probability because it
4201     // models a copy at the end of each predicated block.
4202     ScalarizationCost += VF.getKnownMinValue() *
4203       TTI.getCFInstrCost(Instruction::PHI, CostKind);
4204 
4205     // The cost of the non-predicated instruction.
4206     ScalarizationCost += VF.getKnownMinValue() *
4207       TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind);
4208 
4209     // The cost of insertelement and extractelement instructions needed for
4210     // scalarization.
4211     ScalarizationCost += getScalarizationOverhead(I, VF, CostKind);
4212 
4213     // Scale the cost by the probability of executing the predicated blocks.
4214     // This assumes the predicated block for each vector lane is equally
4215     // likely.
4216     ScalarizationCost = ScalarizationCost / getReciprocalPredBlockProb();
4217   }
4218   InstructionCost SafeDivisorCost = 0;
4219 
4220   auto *VecTy = ToVectorTy(I->getType(), VF);
4221 
4222   // The cost of the select guard to ensure all lanes are well defined
4223   // after we speculate above any internal control flow.
4224   SafeDivisorCost += TTI.getCmpSelInstrCost(
4225     Instruction::Select, VecTy,
4226     ToVectorTy(Type::getInt1Ty(I->getContext()), VF),
4227     CmpInst::BAD_ICMP_PREDICATE, CostKind);
4228 
4229   // Certain instructions can be cheaper to vectorize if they have a constant
4230   // second vector operand. One example of this are shifts on x86.
4231   Value *Op2 = I->getOperand(1);
4232   auto Op2Info = TTI.getOperandInfo(Op2);
4233   if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
4234       Legal->isInvariant(Op2))
4235     Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
4236 
4237   SmallVector<const Value *, 4> Operands(I->operand_values());
4238   SafeDivisorCost += TTI.getArithmeticInstrCost(
4239     I->getOpcode(), VecTy, CostKind,
4240     {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
4241     Op2Info, Operands, I);
4242   return {ScalarizationCost, SafeDivisorCost};
4243 }
4244 
4245 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
4246     Instruction *I, ElementCount VF) {
4247   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4248   assert(getWideningDecision(I, VF) == CM_Unknown &&
4249          "Decision should not be set yet.");
4250   auto *Group = getInterleavedAccessGroup(I);
4251   assert(Group && "Must have a group.");
4252 
4253   // If the instruction's allocated size doesn't equal it's type size, it
4254   // requires padding and will be scalarized.
4255   auto &DL = I->getModule()->getDataLayout();
4256   auto *ScalarTy = getLoadStoreType(I);
4257   if (hasIrregularType(ScalarTy, DL))
4258     return false;
4259 
4260   // If the group involves a non-integral pointer, we may not be able to
4261   // losslessly cast all values to a common type.
4262   unsigned InterleaveFactor = Group->getFactor();
4263   bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy);
4264   for (unsigned i = 0; i < InterleaveFactor; i++) {
4265     Instruction *Member = Group->getMember(i);
4266     if (!Member)
4267       continue;
4268     auto *MemberTy = getLoadStoreType(Member);
4269     bool MemberNI = DL.isNonIntegralPointerType(MemberTy);
4270     // Don't coerce non-integral pointers to integers or vice versa.
4271     if (MemberNI != ScalarNI) {
4272       // TODO: Consider adding special nullptr value case here
4273       return false;
4274     } else if (MemberNI && ScalarNI &&
4275                ScalarTy->getPointerAddressSpace() !=
4276                MemberTy->getPointerAddressSpace()) {
4277       return false;
4278     }
4279   }
4280 
4281   // Check if masking is required.
4282   // A Group may need masking for one of two reasons: it resides in a block that
4283   // needs predication, or it was decided to use masking to deal with gaps
4284   // (either a gap at the end of a load-access that may result in a speculative
4285   // load, or any gaps in a store-access).
4286   bool PredicatedAccessRequiresMasking =
4287       blockNeedsPredicationForAnyReason(I->getParent()) &&
4288       Legal->isMaskRequired(I);
4289   bool LoadAccessWithGapsRequiresEpilogMasking =
4290       isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
4291       !isScalarEpilogueAllowed();
4292   bool StoreAccessWithGapsRequiresMasking =
4293       isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
4294   if (!PredicatedAccessRequiresMasking &&
4295       !LoadAccessWithGapsRequiresEpilogMasking &&
4296       !StoreAccessWithGapsRequiresMasking)
4297     return true;
4298 
4299   // If masked interleaving is required, we expect that the user/target had
4300   // enabled it, because otherwise it either wouldn't have been created or
4301   // it should have been invalidated by the CostModel.
4302   assert(useMaskedInterleavedAccesses(TTI) &&
4303          "Masked interleave-groups for predicated accesses are not enabled.");
4304 
4305   if (Group->isReverse())
4306     return false;
4307 
4308   auto *Ty = getLoadStoreType(I);
4309   const Align Alignment = getLoadStoreAlignment(I);
4310   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4311                           : TTI.isLegalMaskedStore(Ty, Alignment);
4312 }
4313 
4314 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
4315     Instruction *I, ElementCount VF) {
4316   // Get and ensure we have a valid memory instruction.
4317   assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
4318 
4319   auto *Ptr = getLoadStorePointerOperand(I);
4320   auto *ScalarTy = getLoadStoreType(I);
4321 
4322   // In order to be widened, the pointer should be consecutive, first of all.
4323   if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
4324     return false;
4325 
4326   // If the instruction is a store located in a predicated block, it will be
4327   // scalarized.
4328   if (isScalarWithPredication(I, VF))
4329     return false;
4330 
4331   // If the instruction's allocated size doesn't equal it's type size, it
4332   // requires padding and will be scalarized.
4333   auto &DL = I->getModule()->getDataLayout();
4334   if (hasIrregularType(ScalarTy, DL))
4335     return false;
4336 
4337   return true;
4338 }
4339 
4340 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
4341   // We should not collect Uniforms more than once per VF. Right now,
4342   // this function is called from collectUniformsAndScalars(), which
4343   // already does this check. Collecting Uniforms for VF=1 does not make any
4344   // sense.
4345 
4346   assert(VF.isVector() && !Uniforms.contains(VF) &&
4347          "This function should not be visited twice for the same VF");
4348 
4349   // Visit the list of Uniforms. If we'll not find any uniform value, we'll
4350   // not analyze again.  Uniforms.count(VF) will return 1.
4351   Uniforms[VF].clear();
4352 
4353   // We now know that the loop is vectorizable!
4354   // Collect instructions inside the loop that will remain uniform after
4355   // vectorization.
4356 
4357   // Global values, params and instructions outside of current loop are out of
4358   // scope.
4359   auto isOutOfScope = [&](Value *V) -> bool {
4360     Instruction *I = dyn_cast<Instruction>(V);
4361     return (!I || !TheLoop->contains(I));
4362   };
4363 
4364   // Worklist containing uniform instructions demanding lane 0.
4365   SetVector<Instruction *> Worklist;
4366   BasicBlock *Latch = TheLoop->getLoopLatch();
4367 
4368   // Add uniform instructions demanding lane 0 to the worklist. Instructions
4369   // that are scalar with predication must not be considered uniform after
4370   // vectorization, because that would create an erroneous replicating region
4371   // where only a single instance out of VF should be formed.
4372   // TODO: optimize such seldom cases if found important, see PR40816.
4373   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
4374     if (isOutOfScope(I)) {
4375       LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
4376                         << *I << "\n");
4377       return;
4378     }
4379     if (isScalarWithPredication(I, VF)) {
4380       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
4381                         << *I << "\n");
4382       return;
4383     }
4384     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
4385     Worklist.insert(I);
4386   };
4387 
4388   // Start with the conditional branch. If the branch condition is an
4389   // instruction contained in the loop that is only used by the branch, it is
4390   // uniform.
4391   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4392   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
4393     addToWorklistIfAllowed(Cmp);
4394 
4395   auto PrevVF = VF.divideCoefficientBy(2);
4396   // Return true if all lanes perform the same memory operation, and we can
4397   // thus chose to execute only one.
4398   auto isUniformMemOpUse = [&](Instruction *I) {
4399     // If the value was already known to not be uniform for the previous
4400     // (smaller VF), it cannot be uniform for the larger VF.
4401     if (PrevVF.isVector()) {
4402       auto Iter = Uniforms.find(PrevVF);
4403       if (Iter != Uniforms.end() && !Iter->second.contains(I))
4404         return false;
4405     }
4406     if (!Legal->isUniformMemOp(*I, VF))
4407       return false;
4408     if (isa<LoadInst>(I))
4409       // Loading the same address always produces the same result - at least
4410       // assuming aliasing and ordering which have already been checked.
4411       return true;
4412     // Storing the same value on every iteration.
4413     return TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand());
4414   };
4415 
4416   auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
4417     InstWidening WideningDecision = getWideningDecision(I, VF);
4418     assert(WideningDecision != CM_Unknown &&
4419            "Widening decision should be ready at this moment");
4420 
4421     if (isUniformMemOpUse(I))
4422       return true;
4423 
4424     return (WideningDecision == CM_Widen ||
4425             WideningDecision == CM_Widen_Reverse ||
4426             WideningDecision == CM_Interleave);
4427   };
4428 
4429   // Returns true if Ptr is the pointer operand of a memory access instruction
4430   // I, I is known to not require scalarization, and the pointer is not also
4431   // stored.
4432   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4433     if (isa<StoreInst>(I) && I->getOperand(0) == Ptr)
4434       return false;
4435     return getLoadStorePointerOperand(I) == Ptr &&
4436            (isUniformDecision(I, VF) || Legal->isInvariant(Ptr));
4437   };
4438 
4439   // Holds a list of values which are known to have at least one uniform use.
4440   // Note that there may be other uses which aren't uniform.  A "uniform use"
4441   // here is something which only demands lane 0 of the unrolled iterations;
4442   // it does not imply that all lanes produce the same value (e.g. this is not
4443   // the usual meaning of uniform)
4444   SetVector<Value *> HasUniformUse;
4445 
4446   // Scan the loop for instructions which are either a) known to have only
4447   // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
4448   for (auto *BB : TheLoop->blocks())
4449     for (auto &I : *BB) {
4450       if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
4451         switch (II->getIntrinsicID()) {
4452         case Intrinsic::sideeffect:
4453         case Intrinsic::experimental_noalias_scope_decl:
4454         case Intrinsic::assume:
4455         case Intrinsic::lifetime_start:
4456         case Intrinsic::lifetime_end:
4457           if (TheLoop->hasLoopInvariantOperands(&I))
4458             addToWorklistIfAllowed(&I);
4459           break;
4460         default:
4461           break;
4462         }
4463       }
4464 
4465       // ExtractValue instructions must be uniform, because the operands are
4466       // known to be loop-invariant.
4467       if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
4468         assert(isOutOfScope(EVI->getAggregateOperand()) &&
4469                "Expected aggregate value to be loop invariant");
4470         addToWorklistIfAllowed(EVI);
4471         continue;
4472       }
4473 
4474       // If there's no pointer operand, there's nothing to do.
4475       auto *Ptr = getLoadStorePointerOperand(&I);
4476       if (!Ptr)
4477         continue;
4478 
4479       if (isUniformMemOpUse(&I))
4480         addToWorklistIfAllowed(&I);
4481 
4482       if (isVectorizedMemAccessUse(&I, Ptr))
4483         HasUniformUse.insert(Ptr);
4484     }
4485 
4486   // Add to the worklist any operands which have *only* uniform (e.g. lane 0
4487   // demanding) users.  Since loops are assumed to be in LCSSA form, this
4488   // disallows uses outside the loop as well.
4489   for (auto *V : HasUniformUse) {
4490     if (isOutOfScope(V))
4491       continue;
4492     auto *I = cast<Instruction>(V);
4493     auto UsersAreMemAccesses =
4494       llvm::all_of(I->users(), [&](User *U) -> bool {
4495         return isVectorizedMemAccessUse(cast<Instruction>(U), V);
4496       });
4497     if (UsersAreMemAccesses)
4498       addToWorklistIfAllowed(I);
4499   }
4500 
4501   // Expand Worklist in topological order: whenever a new instruction
4502   // is added , its users should be already inside Worklist.  It ensures
4503   // a uniform instruction will only be used by uniform instructions.
4504   unsigned idx = 0;
4505   while (idx != Worklist.size()) {
4506     Instruction *I = Worklist[idx++];
4507 
4508     for (auto *OV : I->operand_values()) {
4509       // isOutOfScope operands cannot be uniform instructions.
4510       if (isOutOfScope(OV))
4511         continue;
4512       // First order recurrence Phi's should typically be considered
4513       // non-uniform.
4514       auto *OP = dyn_cast<PHINode>(OV);
4515       if (OP && Legal->isFixedOrderRecurrence(OP))
4516         continue;
4517       // If all the users of the operand are uniform, then add the
4518       // operand into the uniform worklist.
4519       auto *OI = cast<Instruction>(OV);
4520       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4521             auto *J = cast<Instruction>(U);
4522             return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
4523           }))
4524         addToWorklistIfAllowed(OI);
4525     }
4526   }
4527 
4528   // For an instruction to be added into Worklist above, all its users inside
4529   // the loop should also be in Worklist. However, this condition cannot be
4530   // true for phi nodes that form a cyclic dependence. We must process phi
4531   // nodes separately. An induction variable will remain uniform if all users
4532   // of the induction variable and induction variable update remain uniform.
4533   // The code below handles both pointer and non-pointer induction variables.
4534   for (const auto &Induction : Legal->getInductionVars()) {
4535     auto *Ind = Induction.first;
4536     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4537 
4538     // Determine if all users of the induction variable are uniform after
4539     // vectorization.
4540     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4541       auto *I = cast<Instruction>(U);
4542       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4543              isVectorizedMemAccessUse(I, Ind);
4544     });
4545     if (!UniformInd)
4546       continue;
4547 
4548     // Determine if all users of the induction variable update instruction are
4549     // uniform after vectorization.
4550     auto UniformIndUpdate =
4551         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4552           auto *I = cast<Instruction>(U);
4553           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4554                  isVectorizedMemAccessUse(I, IndUpdate);
4555         });
4556     if (!UniformIndUpdate)
4557       continue;
4558 
4559     // The induction variable and its update instruction will remain uniform.
4560     addToWorklistIfAllowed(Ind);
4561     addToWorklistIfAllowed(IndUpdate);
4562   }
4563 
4564   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4565 }
4566 
4567 bool LoopVectorizationCostModel::runtimeChecksRequired() {
4568   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
4569 
4570   if (Legal->getRuntimePointerChecking()->Need) {
4571     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
4572         "runtime pointer checks needed. Enable vectorization of this "
4573         "loop with '#pragma clang loop vectorize(enable)' when "
4574         "compiling with -Os/-Oz",
4575         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4576     return true;
4577   }
4578 
4579   if (!PSE.getPredicate().isAlwaysTrue()) {
4580     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
4581         "runtime SCEV checks needed. Enable vectorization of this "
4582         "loop with '#pragma clang loop vectorize(enable)' when "
4583         "compiling with -Os/-Oz",
4584         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4585     return true;
4586   }
4587 
4588   // FIXME: Avoid specializing for stride==1 instead of bailing out.
4589   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4590     reportVectorizationFailure("Runtime stride check for small trip count",
4591         "runtime stride == 1 checks needed. Enable vectorization of "
4592         "this loop without such check by compiling with -Os/-Oz",
4593         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4594     return true;
4595   }
4596 
4597   return false;
4598 }
4599 
4600 ElementCount
4601 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
4602   if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
4603     return ElementCount::getScalable(0);
4604 
4605   if (Hints->isScalableVectorizationDisabled()) {
4606     reportVectorizationInfo("Scalable vectorization is explicitly disabled",
4607                             "ScalableVectorizationDisabled", ORE, TheLoop);
4608     return ElementCount::getScalable(0);
4609   }
4610 
4611   LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
4612 
4613   auto MaxScalableVF = ElementCount::getScalable(
4614       std::numeric_limits<ElementCount::ScalarTy>::max());
4615 
4616   // Test that the loop-vectorizer can legalize all operations for this MaxVF.
4617   // FIXME: While for scalable vectors this is currently sufficient, this should
4618   // be replaced by a more detailed mechanism that filters out specific VFs,
4619   // instead of invalidating vectorization for a whole set of VFs based on the
4620   // MaxVF.
4621 
4622   // Disable scalable vectorization if the loop contains unsupported reductions.
4623   if (!canVectorizeReductions(MaxScalableVF)) {
4624     reportVectorizationInfo(
4625         "Scalable vectorization not supported for the reduction "
4626         "operations found in this loop.",
4627         "ScalableVFUnfeasible", ORE, TheLoop);
4628     return ElementCount::getScalable(0);
4629   }
4630 
4631   // Disable scalable vectorization if the loop contains any instructions
4632   // with element types not supported for scalable vectors.
4633   if (any_of(ElementTypesInLoop, [&](Type *Ty) {
4634         return !Ty->isVoidTy() &&
4635                !this->TTI.isElementTypeLegalForScalableVector(Ty);
4636       })) {
4637     reportVectorizationInfo("Scalable vectorization is not supported "
4638                             "for all element types found in this loop.",
4639                             "ScalableVFUnfeasible", ORE, TheLoop);
4640     return ElementCount::getScalable(0);
4641   }
4642 
4643   if (Legal->isSafeForAnyVectorWidth())
4644     return MaxScalableVF;
4645 
4646   // Limit MaxScalableVF by the maximum safe dependence distance.
4647   if (std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI))
4648     MaxScalableVF = ElementCount::getScalable(MaxSafeElements / *MaxVScale);
4649   else
4650     MaxScalableVF = ElementCount::getScalable(0);
4651 
4652   if (!MaxScalableVF)
4653     reportVectorizationInfo(
4654         "Max legal vector width too small, scalable vectorization "
4655         "unfeasible.",
4656         "ScalableVFUnfeasible", ORE, TheLoop);
4657 
4658   return MaxScalableVF;
4659 }
4660 
4661 FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
4662     unsigned MaxTripCount, ElementCount UserVF, bool FoldTailByMasking) {
4663   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
4664   unsigned SmallestType, WidestType;
4665   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
4666 
4667   // Get the maximum safe dependence distance in bits computed by LAA.
4668   // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
4669   // the memory accesses that is most restrictive (involved in the smallest
4670   // dependence distance).
4671   unsigned MaxSafeElements =
4672       llvm::bit_floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
4673 
4674   auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
4675   auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
4676 
4677   LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
4678                     << ".\n");
4679   LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
4680                     << ".\n");
4681 
4682   // First analyze the UserVF, fall back if the UserVF should be ignored.
4683   if (UserVF) {
4684     auto MaxSafeUserVF =
4685         UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
4686 
4687     if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
4688       // If `VF=vscale x N` is safe, then so is `VF=N`
4689       if (UserVF.isScalable())
4690         return FixedScalableVFPair(
4691             ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
4692       else
4693         return UserVF;
4694     }
4695 
4696     assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
4697 
4698     // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
4699     // is better to ignore the hint and let the compiler choose a suitable VF.
4700     if (!UserVF.isScalable()) {
4701       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4702                         << " is unsafe, clamping to max safe VF="
4703                         << MaxSafeFixedVF << ".\n");
4704       ORE->emit([&]() {
4705         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4706                                           TheLoop->getStartLoc(),
4707                                           TheLoop->getHeader())
4708                << "User-specified vectorization factor "
4709                << ore::NV("UserVectorizationFactor", UserVF)
4710                << " is unsafe, clamping to maximum safe vectorization factor "
4711                << ore::NV("VectorizationFactor", MaxSafeFixedVF);
4712       });
4713       return MaxSafeFixedVF;
4714     }
4715 
4716     if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
4717       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4718                         << " is ignored because scalable vectors are not "
4719                            "available.\n");
4720       ORE->emit([&]() {
4721         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4722                                           TheLoop->getStartLoc(),
4723                                           TheLoop->getHeader())
4724                << "User-specified vectorization factor "
4725                << ore::NV("UserVectorizationFactor", UserVF)
4726                << " is ignored because the target does not support scalable "
4727                   "vectors. The compiler will pick a more suitable value.";
4728       });
4729     } else {
4730       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4731                         << " is unsafe. Ignoring scalable UserVF.\n");
4732       ORE->emit([&]() {
4733         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4734                                           TheLoop->getStartLoc(),
4735                                           TheLoop->getHeader())
4736                << "User-specified vectorization factor "
4737                << ore::NV("UserVectorizationFactor", UserVF)
4738                << " is unsafe. Ignoring the hint to let the compiler pick a "
4739                   "more suitable value.";
4740       });
4741     }
4742   }
4743 
4744   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
4745                     << " / " << WidestType << " bits.\n");
4746 
4747   FixedScalableVFPair Result(ElementCount::getFixed(1),
4748                              ElementCount::getScalable(0));
4749   if (auto MaxVF =
4750           getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
4751                                   MaxSafeFixedVF, FoldTailByMasking))
4752     Result.FixedVF = MaxVF;
4753 
4754   if (auto MaxVF =
4755           getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
4756                                   MaxSafeScalableVF, FoldTailByMasking))
4757     if (MaxVF.isScalable()) {
4758       Result.ScalableVF = MaxVF;
4759       LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
4760                         << "\n");
4761     }
4762 
4763   return Result;
4764 }
4765 
4766 FixedScalableVFPair
4767 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
4768   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
4769     // TODO: It may by useful to do since it's still likely to be dynamically
4770     // uniform if the target can skip.
4771     reportVectorizationFailure(
4772         "Not inserting runtime ptr check for divergent target",
4773         "runtime pointer checks needed. Not enabled for divergent target",
4774         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
4775     return FixedScalableVFPair::getNone();
4776   }
4777 
4778   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4779   unsigned MaxTC = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);
4780   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4781   if (TC == 1) {
4782     reportVectorizationFailure("Single iteration (non) loop",
4783         "loop trip count is one, irrelevant for vectorization",
4784         "SingleIterationLoop", ORE, TheLoop);
4785     return FixedScalableVFPair::getNone();
4786   }
4787 
4788   switch (ScalarEpilogueStatus) {
4789   case CM_ScalarEpilogueAllowed:
4790     return computeFeasibleMaxVF(MaxTC, UserVF, false);
4791   case CM_ScalarEpilogueNotAllowedUsePredicate:
4792     [[fallthrough]];
4793   case CM_ScalarEpilogueNotNeededUsePredicate:
4794     LLVM_DEBUG(
4795         dbgs() << "LV: vector predicate hint/switch found.\n"
4796                << "LV: Not allowing scalar epilogue, creating predicated "
4797                << "vector loop.\n");
4798     break;
4799   case CM_ScalarEpilogueNotAllowedLowTripLoop:
4800     // fallthrough as a special case of OptForSize
4801   case CM_ScalarEpilogueNotAllowedOptSize:
4802     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
4803       LLVM_DEBUG(
4804           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
4805     else
4806       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
4807                         << "count.\n");
4808 
4809     // Bail if runtime checks are required, which are not good when optimising
4810     // for size.
4811     if (runtimeChecksRequired())
4812       return FixedScalableVFPair::getNone();
4813 
4814     break;
4815   }
4816 
4817   // The only loops we can vectorize without a scalar epilogue, are loops with
4818   // a bottom-test and a single exiting block. We'd have to handle the fact
4819   // that not every instruction executes on the last iteration.  This will
4820   // require a lane mask which varies through the vector loop body.  (TODO)
4821   if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
4822     // If there was a tail-folding hint/switch, but we can't fold the tail by
4823     // masking, fallback to a vectorization with a scalar epilogue.
4824     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
4825       LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
4826                            "scalar epilogue instead.\n");
4827       ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4828       return computeFeasibleMaxVF(MaxTC, UserVF, false);
4829     }
4830     return FixedScalableVFPair::getNone();
4831   }
4832 
4833   // Now try the tail folding
4834 
4835   // Invalidate interleave groups that require an epilogue if we can't mask
4836   // the interleave-group.
4837   if (!useMaskedInterleavedAccesses(TTI)) {
4838     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
4839            "No decisions should have been taken at this point");
4840     // Note: There is no need to invalidate any cost modeling decisions here, as
4841     // non where taken so far.
4842     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
4843   }
4844 
4845   FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(MaxTC, UserVF, true);
4846 
4847   // Avoid tail folding if the trip count is known to be a multiple of any VF
4848   // we choose.
4849   std::optional<unsigned> MaxPowerOf2RuntimeVF =
4850       MaxFactors.FixedVF.getFixedValue();
4851   if (MaxFactors.ScalableVF) {
4852     std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
4853     if (MaxVScale && TTI.isVScaleKnownToBeAPowerOfTwo()) {
4854       MaxPowerOf2RuntimeVF = std::max<unsigned>(
4855           *MaxPowerOf2RuntimeVF,
4856           *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue());
4857     } else
4858       MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now.
4859   }
4860 
4861   if (MaxPowerOf2RuntimeVF && *MaxPowerOf2RuntimeVF > 0) {
4862     assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
4863            "MaxFixedVF must be a power of 2");
4864     unsigned MaxVFtimesIC =
4865         UserIC ? *MaxPowerOf2RuntimeVF * UserIC : *MaxPowerOf2RuntimeVF;
4866     ScalarEvolution *SE = PSE.getSE();
4867     const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
4868     const SCEV *ExitCount = SE->getAddExpr(
4869         BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
4870     const SCEV *Rem = SE->getURemExpr(
4871         SE->applyLoopGuards(ExitCount, TheLoop),
4872         SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
4873     if (Rem->isZero()) {
4874       // Accept MaxFixedVF if we do not have a tail.
4875       LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
4876       return MaxFactors;
4877     }
4878   }
4879 
4880   // If we don't know the precise trip count, or if the trip count that we
4881   // found modulo the vectorization factor is not zero, try to fold the tail
4882   // by masking.
4883   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
4884   if (Legal->prepareToFoldTailByMasking()) {
4885     CanFoldTailByMasking = true;
4886     return MaxFactors;
4887   }
4888 
4889   // If there was a tail-folding hint/switch, but we can't fold the tail by
4890   // masking, fallback to a vectorization with a scalar epilogue.
4891   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
4892     LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
4893                          "scalar epilogue instead.\n");
4894     ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4895     return MaxFactors;
4896   }
4897 
4898   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
4899     LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
4900     return FixedScalableVFPair::getNone();
4901   }
4902 
4903   if (TC == 0) {
4904     reportVectorizationFailure(
4905         "Unable to calculate the loop count due to complex control flow",
4906         "unable to calculate the loop count due to complex control flow",
4907         "UnknownLoopCountComplexCFG", ORE, TheLoop);
4908     return FixedScalableVFPair::getNone();
4909   }
4910 
4911   reportVectorizationFailure(
4912       "Cannot optimize for size and vectorize at the same time.",
4913       "cannot optimize for size and vectorize at the same time. "
4914       "Enable vectorization of this loop with '#pragma clang loop "
4915       "vectorize(enable)' when compiling with -Os/-Oz",
4916       "NoTailLoopWithOptForSize", ORE, TheLoop);
4917   return FixedScalableVFPair::getNone();
4918 }
4919 
4920 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
4921     unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType,
4922     ElementCount MaxSafeVF, bool FoldTailByMasking) {
4923   bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
4924   const TypeSize WidestRegister = TTI.getRegisterBitWidth(
4925       ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
4926                            : TargetTransformInfo::RGK_FixedWidthVector);
4927 
4928   // Convenience function to return the minimum of two ElementCounts.
4929   auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
4930     assert((LHS.isScalable() == RHS.isScalable()) &&
4931            "Scalable flags must match");
4932     return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
4933   };
4934 
4935   // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
4936   // Note that both WidestRegister and WidestType may not be a powers of 2.
4937   auto MaxVectorElementCount = ElementCount::get(
4938       llvm::bit_floor(WidestRegister.getKnownMinValue() / WidestType),
4939       ComputeScalableMaxVF);
4940   MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
4941   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
4942                     << (MaxVectorElementCount * WidestType) << " bits.\n");
4943 
4944   if (!MaxVectorElementCount) {
4945     LLVM_DEBUG(dbgs() << "LV: The target has no "
4946                       << (ComputeScalableMaxVF ? "scalable" : "fixed")
4947                       << " vector registers.\n");
4948     return ElementCount::getFixed(1);
4949   }
4950 
4951   unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue();
4952   if (MaxVectorElementCount.isScalable() &&
4953       TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
4954     auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
4955     auto Min = Attr.getVScaleRangeMin();
4956     WidestRegisterMinEC *= Min;
4957   }
4958 
4959   // When a scalar epilogue is required, at least one iteration of the scalar
4960   // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
4961   // max VF that results in a dead vector loop.
4962   if (MaxTripCount > 0 && requiresScalarEpilogue(true))
4963     MaxTripCount -= 1;
4964 
4965   if (MaxTripCount && MaxTripCount <= WidestRegisterMinEC &&
4966       (!FoldTailByMasking || isPowerOf2_32(MaxTripCount))) {
4967     // If upper bound loop trip count (TC) is known at compile time there is no
4968     // point in choosing VF greater than TC (as done in the loop below). Select
4969     // maximum power of two which doesn't exceed TC. If MaxVectorElementCount is
4970     // scalable, we only fall back on a fixed VF when the TC is less than or
4971     // equal to the known number of lanes.
4972     auto ClampedUpperTripCount = llvm::bit_floor(MaxTripCount);
4973     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
4974                          "exceeding the constant trip count: "
4975                       << ClampedUpperTripCount << "\n");
4976     return ElementCount::get(
4977         ClampedUpperTripCount,
4978         FoldTailByMasking ? MaxVectorElementCount.isScalable() : false);
4979   }
4980 
4981   TargetTransformInfo::RegisterKind RegKind =
4982       ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
4983                            : TargetTransformInfo::RGK_FixedWidthVector;
4984   ElementCount MaxVF = MaxVectorElementCount;
4985   if (MaximizeBandwidth ||
4986       (MaximizeBandwidth.getNumOccurrences() == 0 &&
4987        (TTI.shouldMaximizeVectorBandwidth(RegKind) ||
4988         (UseWiderVFIfCallVariantsPresent && Legal->hasVectorCallVariants())))) {
4989     auto MaxVectorElementCountMaxBW = ElementCount::get(
4990         llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType),
4991         ComputeScalableMaxVF);
4992     MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
4993 
4994     // Collect all viable vectorization factors larger than the default MaxVF
4995     // (i.e. MaxVectorElementCount).
4996     SmallVector<ElementCount, 8> VFs;
4997     for (ElementCount VS = MaxVectorElementCount * 2;
4998          ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
4999       VFs.push_back(VS);
5000 
5001     // For each VF calculate its register usage.
5002     auto RUs = calculateRegisterUsage(VFs);
5003 
5004     // Select the largest VF which doesn't require more registers than existing
5005     // ones.
5006     for (int i = RUs.size() - 1; i >= 0; --i) {
5007       bool Selected = true;
5008       for (auto &pair : RUs[i].MaxLocalUsers) {
5009         unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5010         if (pair.second > TargetNumRegisters)
5011           Selected = false;
5012       }
5013       if (Selected) {
5014         MaxVF = VFs[i];
5015         break;
5016       }
5017     }
5018     if (ElementCount MinVF =
5019             TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
5020       if (ElementCount::isKnownLT(MaxVF, MinVF)) {
5021         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5022                           << ") with target's minimum: " << MinVF << '\n');
5023         MaxVF = MinVF;
5024       }
5025     }
5026 
5027     // Invalidate any widening decisions we might have made, in case the loop
5028     // requires prediction (decided later), but we have already made some
5029     // load/store widening decisions.
5030     invalidateCostModelingDecisions();
5031   }
5032   return MaxVF;
5033 }
5034 
5035 /// Convenience function that returns the value of vscale_range iff
5036 /// vscale_range.min == vscale_range.max or otherwise returns the value
5037 /// returned by the corresponding TTI method.
5038 static std::optional<unsigned>
5039 getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI) {
5040   const Function *Fn = L->getHeader()->getParent();
5041   if (Fn->hasFnAttribute(Attribute::VScaleRange)) {
5042     auto Attr = Fn->getFnAttribute(Attribute::VScaleRange);
5043     auto Min = Attr.getVScaleRangeMin();
5044     auto Max = Attr.getVScaleRangeMax();
5045     if (Max && Min == Max)
5046       return Max;
5047   }
5048 
5049   return TTI.getVScaleForTuning();
5050 }
5051 
5052 bool LoopVectorizationPlanner::isMoreProfitable(
5053     const VectorizationFactor &A, const VectorizationFactor &B) const {
5054   InstructionCost CostA = A.Cost;
5055   InstructionCost CostB = B.Cost;
5056 
5057   unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(OrigLoop);
5058 
5059   if (!A.Width.isScalable() && !B.Width.isScalable() && MaxTripCount) {
5060     // If the trip count is a known (possibly small) constant, the trip count
5061     // will be rounded up to an integer number of iterations under
5062     // FoldTailByMasking. The total cost in that case will be
5063     // VecCost*ceil(TripCount/VF). When not folding the tail, the total
5064     // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be
5065     // some extra overheads, but for the purpose of comparing the costs of
5066     // different VFs we can use this to compare the total loop-body cost
5067     // expected after vectorization.
5068     auto GetCostForTC = [MaxTripCount, this](unsigned VF,
5069                                              InstructionCost VectorCost,
5070                                              InstructionCost ScalarCost) {
5071       return CM.foldTailByMasking() ? VectorCost * divideCeil(MaxTripCount, VF)
5072                                     : VectorCost * (MaxTripCount / VF) +
5073                                           ScalarCost * (MaxTripCount % VF);
5074     };
5075     auto RTCostA = GetCostForTC(A.Width.getFixedValue(), CostA, A.ScalarCost);
5076     auto RTCostB = GetCostForTC(B.Width.getFixedValue(), CostB, B.ScalarCost);
5077 
5078     return RTCostA < RTCostB;
5079   }
5080 
5081   // Improve estimate for the vector width if it is scalable.
5082   unsigned EstimatedWidthA = A.Width.getKnownMinValue();
5083   unsigned EstimatedWidthB = B.Width.getKnownMinValue();
5084   if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI)) {
5085     if (A.Width.isScalable())
5086       EstimatedWidthA *= *VScale;
5087     if (B.Width.isScalable())
5088       EstimatedWidthB *= *VScale;
5089   }
5090 
5091   // Assume vscale may be larger than 1 (or the value being tuned for),
5092   // so that scalable vectorization is slightly favorable over fixed-width
5093   // vectorization.
5094   if (A.Width.isScalable() && !B.Width.isScalable())
5095     return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA);
5096 
5097   // To avoid the need for FP division:
5098   //      (CostA / A.Width) < (CostB / B.Width)
5099   // <=>  (CostA * B.Width) < (CostB * A.Width)
5100   return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA);
5101 }
5102 
5103 static void emitInvalidCostRemarks(SmallVector<InstructionVFPair> InvalidCosts,
5104                                    OptimizationRemarkEmitter *ORE,
5105                                    Loop *TheLoop) {
5106   if (InvalidCosts.empty())
5107     return;
5108 
5109   // Emit a report of VFs with invalid costs in the loop.
5110 
5111   // Group the remarks per instruction, keeping the instruction order from
5112   // InvalidCosts.
5113   std::map<Instruction *, unsigned> Numbering;
5114   unsigned I = 0;
5115   for (auto &Pair : InvalidCosts)
5116     if (!Numbering.count(Pair.first))
5117       Numbering[Pair.first] = I++;
5118 
5119   // Sort the list, first on instruction(number) then on VF.
5120   sort(InvalidCosts, [&Numbering](InstructionVFPair &A, InstructionVFPair &B) {
5121     if (Numbering[A.first] != Numbering[B.first])
5122       return Numbering[A.first] < Numbering[B.first];
5123     ElementCountComparator ECC;
5124     return ECC(A.second, B.second);
5125   });
5126 
5127   // For a list of ordered instruction-vf pairs:
5128   //   [(load, vf1), (load, vf2), (store, vf1)]
5129   // Group the instructions together to emit separate remarks for:
5130   //   load  (vf1, vf2)
5131   //   store (vf1)
5132   auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts);
5133   auto Subset = ArrayRef<InstructionVFPair>();
5134   do {
5135     if (Subset.empty())
5136       Subset = Tail.take_front(1);
5137 
5138     Instruction *I = Subset.front().first;
5139 
5140     // If the next instruction is different, or if there are no other pairs,
5141     // emit a remark for the collated subset. e.g.
5142     //   [(load, vf1), (load, vf2))]
5143     // to emit:
5144     //  remark: invalid costs for 'load' at VF=(vf, vf2)
5145     if (Subset == Tail || Tail[Subset.size()].first != I) {
5146       std::string OutString;
5147       raw_string_ostream OS(OutString);
5148       assert(!Subset.empty() && "Unexpected empty range");
5149       OS << "Instruction with invalid costs prevented vectorization at VF=(";
5150       for (const auto &Pair : Subset)
5151         OS << (Pair.second == Subset.front().second ? "" : ", ") << Pair.second;
5152       OS << "):";
5153       if (auto *CI = dyn_cast<CallInst>(I))
5154         OS << " call to " << CI->getCalledFunction()->getName();
5155       else
5156         OS << " " << I->getOpcodeName();
5157       OS.flush();
5158       reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I);
5159       Tail = Tail.drop_front(Subset.size());
5160       Subset = {};
5161     } else
5162       // Grow the subset by one element
5163       Subset = Tail.take_front(Subset.size() + 1);
5164   } while (!Tail.empty());
5165 }
5166 
5167 VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor(
5168     const ElementCountSet &VFCandidates) {
5169   InstructionCost ExpectedCost =
5170       CM.expectedCost(ElementCount::getFixed(1)).first;
5171   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
5172   assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
5173   assert(VFCandidates.count(ElementCount::getFixed(1)) &&
5174          "Expected Scalar VF to be a candidate");
5175 
5176   const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost,
5177                                        ExpectedCost);
5178   VectorizationFactor ChosenFactor = ScalarCost;
5179 
5180   bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
5181   if (ForceVectorization && VFCandidates.size() > 1) {
5182     // Ignore scalar width, because the user explicitly wants vectorization.
5183     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5184     // evaluation.
5185     ChosenFactor.Cost = InstructionCost::getMax();
5186   }
5187 
5188   SmallVector<InstructionVFPair> InvalidCosts;
5189   for (const auto &i : VFCandidates) {
5190     // The cost for scalar VF=1 is already calculated, so ignore it.
5191     if (i.isScalar())
5192       continue;
5193 
5194     LoopVectorizationCostModel::VectorizationCostTy C =
5195         CM.expectedCost(i, &InvalidCosts);
5196     VectorizationFactor Candidate(i, C.first, ScalarCost.ScalarCost);
5197 
5198 #ifndef NDEBUG
5199     unsigned AssumedMinimumVscale = 1;
5200     if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI))
5201       AssumedMinimumVscale = *VScale;
5202     unsigned Width =
5203         Candidate.Width.isScalable()
5204             ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
5205             : Candidate.Width.getFixedValue();
5206     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5207                       << " costs: " << (Candidate.Cost / Width));
5208     if (i.isScalable())
5209       LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
5210                         << AssumedMinimumVscale << ")");
5211     LLVM_DEBUG(dbgs() << ".\n");
5212 #endif
5213 
5214     if (!C.second && !ForceVectorization) {
5215       LLVM_DEBUG(
5216           dbgs() << "LV: Not considering vector loop of width " << i
5217                  << " because it will not generate any vector instructions.\n");
5218       continue;
5219     }
5220 
5221     // If profitable add it to ProfitableVF list.
5222     if (isMoreProfitable(Candidate, ScalarCost))
5223       ProfitableVFs.push_back(Candidate);
5224 
5225     if (isMoreProfitable(Candidate, ChosenFactor))
5226       ChosenFactor = Candidate;
5227   }
5228 
5229   emitInvalidCostRemarks(InvalidCosts, ORE, OrigLoop);
5230 
5231   if (!EnableCondStoresVectorization && CM.hasPredStores()) {
5232     reportVectorizationFailure(
5233         "There are conditional stores.",
5234         "store that is conditionally executed prevents vectorization",
5235         "ConditionalStore", ORE, OrigLoop);
5236     ChosenFactor = ScalarCost;
5237   }
5238 
5239   LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
5240                  !isMoreProfitable(ChosenFactor, ScalarCost)) dbgs()
5241              << "LV: Vectorization seems to be not beneficial, "
5242              << "but was forced by a user.\n");
5243   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
5244   return ChosenFactor;
5245 }
5246 
5247 bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
5248     ElementCount VF) const {
5249   // Cross iteration phis such as reductions need special handling and are
5250   // currently unsupported.
5251   if (any_of(OrigLoop->getHeader()->phis(),
5252              [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(&Phi); }))
5253     return false;
5254 
5255   // Phis with uses outside of the loop require special handling and are
5256   // currently unsupported.
5257   for (const auto &Entry : Legal->getInductionVars()) {
5258     // Look for uses of the value of the induction at the last iteration.
5259     Value *PostInc =
5260         Entry.first->getIncomingValueForBlock(OrigLoop->getLoopLatch());
5261     for (User *U : PostInc->users())
5262       if (!OrigLoop->contains(cast<Instruction>(U)))
5263         return false;
5264     // Look for uses of penultimate value of the induction.
5265     for (User *U : Entry.first->users())
5266       if (!OrigLoop->contains(cast<Instruction>(U)))
5267         return false;
5268   }
5269 
5270   // Epilogue vectorization code has not been auditted to ensure it handles
5271   // non-latch exits properly.  It may be fine, but it needs auditted and
5272   // tested.
5273   if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch())
5274     return false;
5275 
5276   return true;
5277 }
5278 
5279 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
5280     const ElementCount VF) const {
5281   // FIXME: We need a much better cost-model to take different parameters such
5282   // as register pressure, code size increase and cost of extra branches into
5283   // account. For now we apply a very crude heuristic and only consider loops
5284   // with vectorization factors larger than a certain value.
5285 
5286   // Allow the target to opt out entirely.
5287   if (!TTI.preferEpilogueVectorization())
5288     return false;
5289 
5290   // We also consider epilogue vectorization unprofitable for targets that don't
5291   // consider interleaving beneficial (eg. MVE).
5292   if (TTI.getMaxInterleaveFactor(VF) <= 1)
5293     return false;
5294 
5295   unsigned Multiplier = 1;
5296   if (VF.isScalable())
5297     Multiplier = getVScaleForTuning(TheLoop, TTI).value_or(1);
5298   if ((Multiplier * VF.getKnownMinValue()) >= EpilogueVectorizationMinVF)
5299     return true;
5300   return false;
5301 }
5302 
5303 VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
5304     const ElementCount MainLoopVF, unsigned IC) {
5305   VectorizationFactor Result = VectorizationFactor::Disabled();
5306   if (!EnableEpilogueVectorization) {
5307     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n");
5308     return Result;
5309   }
5310 
5311   if (!CM.isScalarEpilogueAllowed()) {
5312     LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no "
5313                          "epilogue is allowed.\n");
5314     return Result;
5315   }
5316 
5317   // Not really a cost consideration, but check for unsupported cases here to
5318   // simplify the logic.
5319   if (!isCandidateForEpilogueVectorization(MainLoopVF)) {
5320     LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop "
5321                          "is not a supported candidate.\n");
5322     return Result;
5323   }
5324 
5325   if (EpilogueVectorizationForceVF > 1) {
5326     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n");
5327     ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF);
5328     if (hasPlanWithVF(ForcedEC))
5329       return {ForcedEC, 0, 0};
5330     else {
5331       LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not "
5332                            "viable.\n");
5333       return Result;
5334     }
5335   }
5336 
5337   if (OrigLoop->getHeader()->getParent()->hasOptSize() ||
5338       OrigLoop->getHeader()->getParent()->hasMinSize()) {
5339     LLVM_DEBUG(
5340         dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n");
5341     return Result;
5342   }
5343 
5344   if (!CM.isEpilogueVectorizationProfitable(MainLoopVF)) {
5345     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
5346                          "this loop\n");
5347     return Result;
5348   }
5349 
5350   // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
5351   // the main loop handles 8 lanes per iteration. We could still benefit from
5352   // vectorizing the epilogue loop with VF=4.
5353   ElementCount EstimatedRuntimeVF = MainLoopVF;
5354   if (MainLoopVF.isScalable()) {
5355     EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue());
5356     if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI))
5357       EstimatedRuntimeVF *= *VScale;
5358   }
5359 
5360   ScalarEvolution &SE = *PSE.getSE();
5361   Type *TCType = Legal->getWidestInductionType();
5362   const SCEV *RemainingIterations = nullptr;
5363   for (auto &NextVF : ProfitableVFs) {
5364     // Skip candidate VFs without a corresponding VPlan.
5365     if (!hasPlanWithVF(NextVF.Width))
5366       continue;
5367 
5368     // Skip candidate VFs with widths >= the estimate runtime VF (scalable
5369     // vectors) or the VF of the main loop (fixed vectors).
5370     if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
5371          ElementCount::isKnownGE(NextVF.Width, EstimatedRuntimeVF)) ||
5372         ElementCount::isKnownGE(NextVF.Width, MainLoopVF))
5373       continue;
5374 
5375     // If NextVF is greater than the number of remaining iterations, the
5376     // epilogue loop would be dead. Skip such factors.
5377     if (!MainLoopVF.isScalable() && !NextVF.Width.isScalable()) {
5378       // TODO: extend to support scalable VFs.
5379       if (!RemainingIterations) {
5380         const SCEV *TC = createTripCountSCEV(TCType, PSE, OrigLoop);
5381         RemainingIterations = SE.getURemExpr(
5382             TC, SE.getConstant(TCType, MainLoopVF.getKnownMinValue() * IC));
5383       }
5384       if (SE.isKnownPredicate(
5385               CmpInst::ICMP_UGT,
5386               SE.getConstant(TCType, NextVF.Width.getKnownMinValue()),
5387               RemainingIterations))
5388         continue;
5389     }
5390 
5391     if (Result.Width.isScalar() || isMoreProfitable(NextVF, Result))
5392       Result = NextVF;
5393   }
5394 
5395   if (Result != VectorizationFactor::Disabled())
5396     LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
5397                       << Result.Width << "\n");
5398   return Result;
5399 }
5400 
5401 std::pair<unsigned, unsigned>
5402 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5403   unsigned MinWidth = -1U;
5404   unsigned MaxWidth = 8;
5405   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5406   // For in-loop reductions, no element types are added to ElementTypesInLoop
5407   // if there are no loads/stores in the loop. In this case, check through the
5408   // reduction variables to determine the maximum width.
5409   if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
5410     // Reset MaxWidth so that we can find the smallest type used by recurrences
5411     // in the loop.
5412     MaxWidth = -1U;
5413     for (const auto &PhiDescriptorPair : Legal->getReductionVars()) {
5414       const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
5415       // When finding the min width used by the recurrence we need to account
5416       // for casts on the input operands of the recurrence.
5417       MaxWidth = std::min<unsigned>(
5418           MaxWidth, std::min<unsigned>(
5419                         RdxDesc.getMinWidthCastToRecurrenceTypeInBits(),
5420                         RdxDesc.getRecurrenceType()->getScalarSizeInBits()));
5421     }
5422   } else {
5423     for (Type *T : ElementTypesInLoop) {
5424       MinWidth = std::min<unsigned>(
5425           MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
5426       MaxWidth = std::max<unsigned>(
5427           MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
5428     }
5429   }
5430   return {MinWidth, MaxWidth};
5431 }
5432 
5433 void LoopVectorizationCostModel::collectElementTypesForWidening() {
5434   ElementTypesInLoop.clear();
5435   // For each block.
5436   for (BasicBlock *BB : TheLoop->blocks()) {
5437     // For each instruction in the loop.
5438     for (Instruction &I : BB->instructionsWithoutDebug()) {
5439       Type *T = I.getType();
5440 
5441       // Skip ignored values.
5442       if (ValuesToIgnore.count(&I))
5443         continue;
5444 
5445       // Only examine Loads, Stores and PHINodes.
5446       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5447         continue;
5448 
5449       // Examine PHI nodes that are reduction variables. Update the type to
5450       // account for the recurrence type.
5451       if (auto *PN = dyn_cast<PHINode>(&I)) {
5452         if (!Legal->isReductionVariable(PN))
5453           continue;
5454         const RecurrenceDescriptor &RdxDesc =
5455             Legal->getReductionVars().find(PN)->second;
5456         if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
5457             TTI.preferInLoopReduction(RdxDesc.getOpcode(),
5458                                       RdxDesc.getRecurrenceType(),
5459                                       TargetTransformInfo::ReductionFlags()))
5460           continue;
5461         T = RdxDesc.getRecurrenceType();
5462       }
5463 
5464       // Examine the stored values.
5465       if (auto *ST = dyn_cast<StoreInst>(&I))
5466         T = ST->getValueOperand()->getType();
5467 
5468       assert(T->isSized() &&
5469              "Expected the load/store/recurrence type to be sized");
5470 
5471       ElementTypesInLoop.insert(T);
5472     }
5473   }
5474 }
5475 
5476 unsigned
5477 LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
5478                                                   InstructionCost LoopCost) {
5479   // -- The interleave heuristics --
5480   // We interleave the loop in order to expose ILP and reduce the loop overhead.
5481   // There are many micro-architectural considerations that we can't predict
5482   // at this level. For example, frontend pressure (on decode or fetch) due to
5483   // code size, or the number and capabilities of the execution ports.
5484   //
5485   // We use the following heuristics to select the interleave count:
5486   // 1. If the code has reductions, then we interleave to break the cross
5487   // iteration dependency.
5488   // 2. If the loop is really small, then we interleave to reduce the loop
5489   // overhead.
5490   // 3. We don't interleave if we think that we will spill registers to memory
5491   // due to the increased register pressure.
5492 
5493   if (!isScalarEpilogueAllowed())
5494     return 1;
5495 
5496   // We used the distance for the interleave count.
5497   if (!Legal->isSafeForAnyVectorWidth())
5498     return 1;
5499 
5500   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5501   const bool HasReductions = !Legal->getReductionVars().empty();
5502   // Do not interleave loops with a relatively small known or estimated trip
5503   // count. But we will interleave when InterleaveSmallLoopScalarReduction is
5504   // enabled, and the code has scalar reductions(HasReductions && VF = 1),
5505   // because with the above conditions interleaving can expose ILP and break
5506   // cross iteration dependences for reductions.
5507   if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
5508       !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
5509     return 1;
5510 
5511   // If we did not calculate the cost for VF (because the user selected the VF)
5512   // then we calculate the cost of VF here.
5513   if (LoopCost == 0) {
5514     LoopCost = expectedCost(VF).first;
5515     assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost");
5516 
5517     // Loop body is free and there is no need for interleaving.
5518     if (LoopCost == 0)
5519       return 1;
5520   }
5521 
5522   RegisterUsage R = calculateRegisterUsage({VF})[0];
5523   // We divide by these constants so assume that we have at least one
5524   // instruction that uses at least one register.
5525   for (auto& pair : R.MaxLocalUsers) {
5526     pair.second = std::max(pair.second, 1U);
5527   }
5528 
5529   // We calculate the interleave count using the following formula.
5530   // Subtract the number of loop invariants from the number of available
5531   // registers. These registers are used by all of the interleaved instances.
5532   // Next, divide the remaining registers by the number of registers that is
5533   // required by the loop, in order to estimate how many parallel instances
5534   // fit without causing spills. All of this is rounded down if necessary to be
5535   // a power of two. We want power of two interleave count to simplify any
5536   // addressing operations or alignment considerations.
5537   // We also want power of two interleave counts to ensure that the induction
5538   // variable of the vector loop wraps to zero, when tail is folded by masking;
5539   // this currently happens when OptForSize, in which case IC is set to 1 above.
5540   unsigned IC = UINT_MAX;
5541 
5542   for (auto& pair : R.MaxLocalUsers) {
5543     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5544     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5545                       << " registers of "
5546                       << TTI.getRegisterClassName(pair.first) << " register class\n");
5547     if (VF.isScalar()) {
5548       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5549         TargetNumRegisters = ForceTargetNumScalarRegs;
5550     } else {
5551       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5552         TargetNumRegisters = ForceTargetNumVectorRegs;
5553     }
5554     unsigned MaxLocalUsers = pair.second;
5555     unsigned LoopInvariantRegs = 0;
5556     if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
5557       LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
5558 
5559     unsigned TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs) /
5560                                      MaxLocalUsers);
5561     // Don't count the induction variable as interleaved.
5562     if (EnableIndVarRegisterHeur) {
5563       TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5564                               std::max(1U, (MaxLocalUsers - 1)));
5565     }
5566 
5567     IC = std::min(IC, TmpIC);
5568   }
5569 
5570   // Clamp the interleave ranges to reasonable counts.
5571   unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
5572 
5573   // Check if the user has overridden the max.
5574   if (VF.isScalar()) {
5575     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5576       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5577   } else {
5578     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5579       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5580   }
5581 
5582   // If trip count is known or estimated compile time constant, limit the
5583   // interleave count to be less than the trip count divided by VF, provided it
5584   // is at least 1.
5585   //
5586   // For scalable vectors we can't know if interleaving is beneficial. It may
5587   // not be beneficial for small loops if none of the lanes in the second vector
5588   // iterations is enabled. However, for larger loops, there is likely to be a
5589   // similar benefit as for fixed-width vectors. For now, we choose to leave
5590   // the InterleaveCount as if vscale is '1', although if some information about
5591   // the vector is known (e.g. min vector size), we can make a better decision.
5592   if (BestKnownTC) {
5593     MaxInterleaveCount =
5594         std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
5595     // Make sure MaxInterleaveCount is greater than 0.
5596     MaxInterleaveCount = std::max(1u, MaxInterleaveCount);
5597   }
5598 
5599   assert(MaxInterleaveCount > 0 &&
5600          "Maximum interleave count must be greater than 0");
5601 
5602   // Clamp the calculated IC to be between the 1 and the max interleave count
5603   // that the target and trip count allows.
5604   if (IC > MaxInterleaveCount)
5605     IC = MaxInterleaveCount;
5606   else
5607     // Make sure IC is greater than 0.
5608     IC = std::max(1u, IC);
5609 
5610   assert(IC > 0 && "Interleave count must be greater than 0.");
5611 
5612   // Interleave if we vectorized this loop and there is a reduction that could
5613   // benefit from interleaving.
5614   if (VF.isVector() && HasReductions) {
5615     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5616     return IC;
5617   }
5618 
5619   // For any scalar loop that either requires runtime checks or predication we
5620   // are better off leaving this to the unroller. Note that if we've already
5621   // vectorized the loop we will have done the runtime check and so interleaving
5622   // won't require further checks.
5623   bool ScalarInterleavingRequiresPredication =
5624       (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) {
5625          return Legal->blockNeedsPredication(BB);
5626        }));
5627   bool ScalarInterleavingRequiresRuntimePointerCheck =
5628       (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
5629 
5630   // We want to interleave small loops in order to reduce the loop overhead and
5631   // potentially expose ILP opportunities.
5632   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
5633                     << "LV: IC is " << IC << '\n'
5634                     << "LV: VF is " << VF << '\n');
5635   const bool AggressivelyInterleaveReductions =
5636       TTI.enableAggressiveInterleaving(HasReductions);
5637   if (!ScalarInterleavingRequiresRuntimePointerCheck &&
5638       !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
5639     // We assume that the cost overhead is 1 and we use the cost model
5640     // to estimate the cost of the loop and interleave until the cost of the
5641     // loop overhead is about 5% of the cost of the loop.
5642     unsigned SmallIC = std::min(IC, (unsigned)llvm::bit_floor<uint64_t>(
5643                                         SmallLoopCost / *LoopCost.getValue()));
5644 
5645     // Interleave until store/load ports (estimated by max interleave count) are
5646     // saturated.
5647     unsigned NumStores = Legal->getNumStores();
5648     unsigned NumLoads = Legal->getNumLoads();
5649     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5650     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5651 
5652     // There is little point in interleaving for reductions containing selects
5653     // and compares when VF=1 since it may just create more overhead than it's
5654     // worth for loops with small trip counts. This is because we still have to
5655     // do the final reduction after the loop.
5656     bool HasSelectCmpReductions =
5657         HasReductions &&
5658         any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5659           const RecurrenceDescriptor &RdxDesc = Reduction.second;
5660           return RecurrenceDescriptor::isAnyOfRecurrenceKind(
5661               RdxDesc.getRecurrenceKind());
5662         });
5663     if (HasSelectCmpReductions) {
5664       LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
5665       return 1;
5666     }
5667 
5668     // If we have a scalar reduction (vector reductions are already dealt with
5669     // by this point), we can increase the critical path length if the loop
5670     // we're interleaving is inside another loop. For tree-wise reductions
5671     // set the limit to 2, and for ordered reductions it's best to disable
5672     // interleaving entirely.
5673     if (HasReductions && TheLoop->getLoopDepth() > 1) {
5674       bool HasOrderedReductions =
5675           any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5676             const RecurrenceDescriptor &RdxDesc = Reduction.second;
5677             return RdxDesc.isOrdered();
5678           });
5679       if (HasOrderedReductions) {
5680         LLVM_DEBUG(
5681             dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
5682         return 1;
5683       }
5684 
5685       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5686       SmallIC = std::min(SmallIC, F);
5687       StoresIC = std::min(StoresIC, F);
5688       LoadsIC = std::min(LoadsIC, F);
5689     }
5690 
5691     if (EnableLoadStoreRuntimeInterleave &&
5692         std::max(StoresIC, LoadsIC) > SmallIC) {
5693       LLVM_DEBUG(
5694           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5695       return std::max(StoresIC, LoadsIC);
5696     }
5697 
5698     // If there are scalar reductions and TTI has enabled aggressive
5699     // interleaving for reductions, we will interleave to expose ILP.
5700     if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
5701         AggressivelyInterleaveReductions) {
5702       LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5703       // Interleave no less than SmallIC but not as aggressive as the normal IC
5704       // to satisfy the rare situation when resources are too limited.
5705       return std::max(IC / 2, SmallIC);
5706     } else {
5707       LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5708       return SmallIC;
5709     }
5710   }
5711 
5712   // Interleave if this is a large loop (small loops are already dealt with by
5713   // this point) that could benefit from interleaving.
5714   if (AggressivelyInterleaveReductions) {
5715     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5716     return IC;
5717   }
5718 
5719   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5720   return 1;
5721 }
5722 
5723 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5724 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
5725   // This function calculates the register usage by measuring the highest number
5726   // of values that are alive at a single location. Obviously, this is a very
5727   // rough estimation. We scan the loop in a topological order in order and
5728   // assign a number to each instruction. We use RPO to ensure that defs are
5729   // met before their users. We assume that each instruction that has in-loop
5730   // users starts an interval. We record every time that an in-loop value is
5731   // used, so we have a list of the first and last occurrences of each
5732   // instruction. Next, we transpose this data structure into a multi map that
5733   // holds the list of intervals that *end* at a specific location. This multi
5734   // map allows us to perform a linear search. We scan the instructions linearly
5735   // and record each time that a new interval starts, by placing it in a set.
5736   // If we find this value in the multi-map then we remove it from the set.
5737   // The max register usage is the maximum size of the set.
5738   // We also search for instructions that are defined outside the loop, but are
5739   // used inside the loop. We need this number separately from the max-interval
5740   // usage number because when we unroll, loop-invariant values do not take
5741   // more register.
5742   LoopBlocksDFS DFS(TheLoop);
5743   DFS.perform(LI);
5744 
5745   RegisterUsage RU;
5746 
5747   // Each 'key' in the map opens a new interval. The values
5748   // of the map are the index of the 'last seen' usage of the
5749   // instruction that is the key.
5750   using IntervalMap = DenseMap<Instruction *, unsigned>;
5751 
5752   // Maps instruction to its index.
5753   SmallVector<Instruction *, 64> IdxToInstr;
5754   // Marks the end of each interval.
5755   IntervalMap EndPoint;
5756   // Saves the list of instruction indices that are used in the loop.
5757   SmallPtrSet<Instruction *, 8> Ends;
5758   // Saves the list of values that are used in the loop but are defined outside
5759   // the loop (not including non-instruction values such as arguments and
5760   // constants).
5761   SmallSetVector<Instruction *, 8> LoopInvariants;
5762 
5763   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5764     for (Instruction &I : BB->instructionsWithoutDebug()) {
5765       IdxToInstr.push_back(&I);
5766 
5767       // Save the end location of each USE.
5768       for (Value *U : I.operands()) {
5769         auto *Instr = dyn_cast<Instruction>(U);
5770 
5771         // Ignore non-instruction values such as arguments, constants, etc.
5772         // FIXME: Might need some motivation why these values are ignored. If
5773         // for example an argument is used inside the loop it will increase the
5774         // register pressure (so shouldn't we add it to LoopInvariants).
5775         if (!Instr)
5776           continue;
5777 
5778         // If this instruction is outside the loop then record it and continue.
5779         if (!TheLoop->contains(Instr)) {
5780           LoopInvariants.insert(Instr);
5781           continue;
5782         }
5783 
5784         // Overwrite previous end points.
5785         EndPoint[Instr] = IdxToInstr.size();
5786         Ends.insert(Instr);
5787       }
5788     }
5789   }
5790 
5791   // Saves the list of intervals that end with the index in 'key'.
5792   using InstrList = SmallVector<Instruction *, 2>;
5793   DenseMap<unsigned, InstrList> TransposeEnds;
5794 
5795   // Transpose the EndPoints to a list of values that end at each index.
5796   for (auto &Interval : EndPoint)
5797     TransposeEnds[Interval.second].push_back(Interval.first);
5798 
5799   SmallPtrSet<Instruction *, 8> OpenIntervals;
5800   SmallVector<RegisterUsage, 8> RUs(VFs.size());
5801   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
5802 
5803   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5804 
5805   const auto &TTICapture = TTI;
5806   auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
5807     if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
5808       return 0;
5809     return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
5810   };
5811 
5812   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
5813     Instruction *I = IdxToInstr[i];
5814 
5815     // Remove all of the instructions that end at this location.
5816     InstrList &List = TransposeEnds[i];
5817     for (Instruction *ToRemove : List)
5818       OpenIntervals.erase(ToRemove);
5819 
5820     // Ignore instructions that are never used within the loop.
5821     if (!Ends.count(I))
5822       continue;
5823 
5824     // Skip ignored values.
5825     if (ValuesToIgnore.count(I))
5826       continue;
5827 
5828     collectInLoopReductions();
5829 
5830     // For each VF find the maximum usage of registers.
5831     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
5832       // Count the number of registers used, per register class, given all open
5833       // intervals.
5834       // Note that elements in this SmallMapVector will be default constructed
5835       // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if
5836       // there is no previous entry for ClassID.
5837       SmallMapVector<unsigned, unsigned, 4> RegUsage;
5838 
5839       if (VFs[j].isScalar()) {
5840         for (auto *Inst : OpenIntervals) {
5841           unsigned ClassID =
5842               TTI.getRegisterClassForType(false, Inst->getType());
5843           // FIXME: The target might use more than one register for the type
5844           // even in the scalar case.
5845           RegUsage[ClassID] += 1;
5846         }
5847       } else {
5848         collectUniformsAndScalars(VFs[j]);
5849         for (auto *Inst : OpenIntervals) {
5850           // Skip ignored values for VF > 1.
5851           if (VecValuesToIgnore.count(Inst))
5852             continue;
5853           if (isScalarAfterVectorization(Inst, VFs[j])) {
5854             unsigned ClassID =
5855                 TTI.getRegisterClassForType(false, Inst->getType());
5856             // FIXME: The target might use more than one register for the type
5857             // even in the scalar case.
5858             RegUsage[ClassID] += 1;
5859           } else {
5860             unsigned ClassID =
5861                 TTI.getRegisterClassForType(true, Inst->getType());
5862             RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
5863           }
5864         }
5865       }
5866 
5867       for (auto& pair : RegUsage) {
5868         auto &Entry = MaxUsages[j][pair.first];
5869         Entry = std::max(Entry, pair.second);
5870       }
5871     }
5872 
5873     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
5874                       << OpenIntervals.size() << '\n');
5875 
5876     // Add the current instruction to the list of open intervals.
5877     OpenIntervals.insert(I);
5878   }
5879 
5880   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
5881     // Note that elements in this SmallMapVector will be default constructed
5882     // as 0. So we can use "Invariant[ClassID] += n" in the code below even if
5883     // there is no previous entry for ClassID.
5884     SmallMapVector<unsigned, unsigned, 4> Invariant;
5885 
5886     for (auto *Inst : LoopInvariants) {
5887       // FIXME: The target might use more than one register for the type
5888       // even in the scalar case.
5889       bool IsScalar = all_of(Inst->users(), [&](User *U) {
5890         auto *I = cast<Instruction>(U);
5891         return TheLoop != LI->getLoopFor(I->getParent()) ||
5892                isScalarAfterVectorization(I, VFs[i]);
5893       });
5894 
5895       ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[i];
5896       unsigned ClassID =
5897           TTI.getRegisterClassForType(VF.isVector(), Inst->getType());
5898       Invariant[ClassID] += GetRegUsage(Inst->getType(), VF);
5899     }
5900 
5901     LLVM_DEBUG({
5902       dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
5903       dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
5904              << " item\n";
5905       for (const auto &pair : MaxUsages[i]) {
5906         dbgs() << "LV(REG): RegisterClass: "
5907                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5908                << " registers\n";
5909       }
5910       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
5911              << " item\n";
5912       for (const auto &pair : Invariant) {
5913         dbgs() << "LV(REG): RegisterClass: "
5914                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5915                << " registers\n";
5916       }
5917     });
5918 
5919     RU.LoopInvariantRegs = Invariant;
5920     RU.MaxLocalUsers = MaxUsages[i];
5921     RUs[i] = RU;
5922   }
5923 
5924   return RUs;
5925 }
5926 
5927 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
5928                                                            ElementCount VF) {
5929   // TODO: Cost model for emulated masked load/store is completely
5930   // broken. This hack guides the cost model to use an artificially
5931   // high enough value to practically disable vectorization with such
5932   // operations, except where previously deployed legality hack allowed
5933   // using very low cost values. This is to avoid regressions coming simply
5934   // from moving "masked load/store" check from legality to cost model.
5935   // Masked Load/Gather emulation was previously never allowed.
5936   // Limited number of Masked Store/Scatter emulation was allowed.
5937   assert((isPredicatedInst(I)) &&
5938          "Expecting a scalar emulated instruction");
5939   return isa<LoadInst>(I) ||
5940          (isa<StoreInst>(I) &&
5941           NumPredStores > NumberOfStoresToPredicate);
5942 }
5943 
5944 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
5945   // If we aren't vectorizing the loop, or if we've already collected the
5946   // instructions to scalarize, there's nothing to do. Collection may already
5947   // have occurred if we have a user-selected VF and are now computing the
5948   // expected cost for interleaving.
5949   if (VF.isScalar() || VF.isZero() || InstsToScalarize.contains(VF))
5950     return;
5951 
5952   // Initialize a mapping for VF in InstsToScalalarize. If we find that it's
5953   // not profitable to scalarize any instructions, the presence of VF in the
5954   // map will indicate that we've analyzed it already.
5955   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5956 
5957   PredicatedBBsAfterVectorization[VF].clear();
5958 
5959   // Find all the instructions that are scalar with predication in the loop and
5960   // determine if it would be better to not if-convert the blocks they are in.
5961   // If so, we also record the instructions to scalarize.
5962   for (BasicBlock *BB : TheLoop->blocks()) {
5963     if (!blockNeedsPredicationForAnyReason(BB))
5964       continue;
5965     for (Instruction &I : *BB)
5966       if (isScalarWithPredication(&I, VF)) {
5967         ScalarCostsTy ScalarCosts;
5968         // Do not apply discount if scalable, because that would lead to
5969         // invalid scalarization costs.
5970         // Do not apply discount logic if hacked cost is needed
5971         // for emulated masked memrefs.
5972         if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) &&
5973             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
5974           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5975         // Remember that BB will remain after vectorization.
5976         PredicatedBBsAfterVectorization[VF].insert(BB);
5977       }
5978   }
5979 }
5980 
5981 InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
5982     Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
5983   assert(!isUniformAfterVectorization(PredInst, VF) &&
5984          "Instruction marked uniform-after-vectorization will be predicated");
5985 
5986   // Initialize the discount to zero, meaning that the scalar version and the
5987   // vector version cost the same.
5988   InstructionCost Discount = 0;
5989 
5990   // Holds instructions to analyze. The instructions we visit are mapped in
5991   // ScalarCosts. Those instructions are the ones that would be scalarized if
5992   // we find that the scalar version costs less.
5993   SmallVector<Instruction *, 8> Worklist;
5994 
5995   // Returns true if the given instruction can be scalarized.
5996   auto canBeScalarized = [&](Instruction *I) -> bool {
5997     // We only attempt to scalarize instructions forming a single-use chain
5998     // from the original predicated block that would otherwise be vectorized.
5999     // Although not strictly necessary, we give up on instructions we know will
6000     // already be scalar to avoid traversing chains that are unlikely to be
6001     // beneficial.
6002     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
6003         isScalarAfterVectorization(I, VF))
6004       return false;
6005 
6006     // If the instruction is scalar with predication, it will be analyzed
6007     // separately. We ignore it within the context of PredInst.
6008     if (isScalarWithPredication(I, VF))
6009       return false;
6010 
6011     // If any of the instruction's operands are uniform after vectorization,
6012     // the instruction cannot be scalarized. This prevents, for example, a
6013     // masked load from being scalarized.
6014     //
6015     // We assume we will only emit a value for lane zero of an instruction
6016     // marked uniform after vectorization, rather than VF identical values.
6017     // Thus, if we scalarize an instruction that uses a uniform, we would
6018     // create uses of values corresponding to the lanes we aren't emitting code
6019     // for. This behavior can be changed by allowing getScalarValue to clone
6020     // the lane zero values for uniforms rather than asserting.
6021     for (Use &U : I->operands())
6022       if (auto *J = dyn_cast<Instruction>(U.get()))
6023         if (isUniformAfterVectorization(J, VF))
6024           return false;
6025 
6026     // Otherwise, we can scalarize the instruction.
6027     return true;
6028   };
6029 
6030   // Compute the expected cost discount from scalarizing the entire expression
6031   // feeding the predicated instruction. We currently only consider expressions
6032   // that are single-use instruction chains.
6033   Worklist.push_back(PredInst);
6034   while (!Worklist.empty()) {
6035     Instruction *I = Worklist.pop_back_val();
6036 
6037     // If we've already analyzed the instruction, there's nothing to do.
6038     if (ScalarCosts.contains(I))
6039       continue;
6040 
6041     // Compute the cost of the vector instruction. Note that this cost already
6042     // includes the scalarization overhead of the predicated instruction.
6043     InstructionCost VectorCost = getInstructionCost(I, VF).first;
6044 
6045     // Compute the cost of the scalarized instruction. This cost is the cost of
6046     // the instruction as if it wasn't if-converted and instead remained in the
6047     // predicated block. We will scale this cost by block probability after
6048     // computing the scalarization overhead.
6049     InstructionCost ScalarCost =
6050         VF.getFixedValue() *
6051         getInstructionCost(I, ElementCount::getFixed(1)).first;
6052 
6053     // Compute the scalarization overhead of needed insertelement instructions
6054     // and phi nodes.
6055     TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6056     if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
6057       ScalarCost += TTI.getScalarizationOverhead(
6058           cast<VectorType>(ToVectorTy(I->getType(), VF)),
6059           APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ true,
6060           /*Extract*/ false, CostKind);
6061       ScalarCost +=
6062           VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind);
6063     }
6064 
6065     // Compute the scalarization overhead of needed extractelement
6066     // instructions. For each of the instruction's operands, if the operand can
6067     // be scalarized, add it to the worklist; otherwise, account for the
6068     // overhead.
6069     for (Use &U : I->operands())
6070       if (auto *J = dyn_cast<Instruction>(U.get())) {
6071         assert(VectorType::isValidElementType(J->getType()) &&
6072                "Instruction has non-scalar type");
6073         if (canBeScalarized(J))
6074           Worklist.push_back(J);
6075         else if (needsExtract(J, VF)) {
6076           ScalarCost += TTI.getScalarizationOverhead(
6077               cast<VectorType>(ToVectorTy(J->getType(), VF)),
6078               APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false,
6079               /*Extract*/ true, CostKind);
6080         }
6081       }
6082 
6083     // Scale the total scalar cost by block probability.
6084     ScalarCost /= getReciprocalPredBlockProb();
6085 
6086     // Compute the discount. A non-negative discount means the vector version
6087     // of the instruction costs more, and scalarizing would be beneficial.
6088     Discount += VectorCost - ScalarCost;
6089     ScalarCosts[I] = ScalarCost;
6090   }
6091 
6092   return Discount;
6093 }
6094 
6095 LoopVectorizationCostModel::VectorizationCostTy
6096 LoopVectorizationCostModel::expectedCost(
6097     ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) {
6098   VectorizationCostTy Cost;
6099 
6100   // For each block.
6101   for (BasicBlock *BB : TheLoop->blocks()) {
6102     VectorizationCostTy BlockCost;
6103 
6104     // For each instruction in the old loop.
6105     for (Instruction &I : BB->instructionsWithoutDebug()) {
6106       // Skip ignored values.
6107       if (ValuesToIgnore.count(&I) ||
6108           (VF.isVector() && VecValuesToIgnore.count(&I)))
6109         continue;
6110 
6111       VectorizationCostTy C = getInstructionCost(&I, VF);
6112 
6113       // Check if we should override the cost.
6114       if (C.first.isValid() &&
6115           ForceTargetInstructionCost.getNumOccurrences() > 0)
6116         C.first = InstructionCost(ForceTargetInstructionCost);
6117 
6118       // Keep a list of instructions with invalid costs.
6119       if (Invalid && !C.first.isValid())
6120         Invalid->emplace_back(&I, VF);
6121 
6122       BlockCost.first += C.first;
6123       BlockCost.second |= C.second;
6124       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
6125                         << " for VF " << VF << " For instruction: " << I
6126                         << '\n');
6127     }
6128 
6129     // If we are vectorizing a predicated block, it will have been
6130     // if-converted. This means that the block's instructions (aside from
6131     // stores and instructions that may divide by zero) will now be
6132     // unconditionally executed. For the scalar case, we may not always execute
6133     // the predicated block, if it is an if-else block. Thus, scale the block's
6134     // cost by the probability of executing it. blockNeedsPredication from
6135     // Legal is used so as to not include all blocks in tail folded loops.
6136     if (VF.isScalar() && Legal->blockNeedsPredication(BB))
6137       BlockCost.first /= getReciprocalPredBlockProb();
6138 
6139     Cost.first += BlockCost.first;
6140     Cost.second |= BlockCost.second;
6141   }
6142 
6143   return Cost;
6144 }
6145 
6146 /// Gets Address Access SCEV after verifying that the access pattern
6147 /// is loop invariant except the induction variable dependence.
6148 ///
6149 /// This SCEV can be sent to the Target in order to estimate the address
6150 /// calculation cost.
6151 static const SCEV *getAddressAccessSCEV(
6152               Value *Ptr,
6153               LoopVectorizationLegality *Legal,
6154               PredicatedScalarEvolution &PSE,
6155               const Loop *TheLoop) {
6156 
6157   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
6158   if (!Gep)
6159     return nullptr;
6160 
6161   // We are looking for a gep with all loop invariant indices except for one
6162   // which should be an induction variable.
6163   auto SE = PSE.getSE();
6164   unsigned NumOperands = Gep->getNumOperands();
6165   for (unsigned i = 1; i < NumOperands; ++i) {
6166     Value *Opd = Gep->getOperand(i);
6167     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
6168         !Legal->isInductionVariable(Opd))
6169       return nullptr;
6170   }
6171 
6172   // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV.
6173   return PSE.getSCEV(Ptr);
6174 }
6175 
6176 InstructionCost
6177 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
6178                                                         ElementCount VF) {
6179   assert(VF.isVector() &&
6180          "Scalarization cost of instruction implies vectorization.");
6181   if (VF.isScalable())
6182     return InstructionCost::getInvalid();
6183 
6184   Type *ValTy = getLoadStoreType(I);
6185   auto SE = PSE.getSE();
6186 
6187   unsigned AS = getLoadStoreAddressSpace(I);
6188   Value *Ptr = getLoadStorePointerOperand(I);
6189   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
6190   // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
6191   //       that it is being called from this specific place.
6192 
6193   // Figure out whether the access is strided and get the stride value
6194   // if it's known in compile time
6195   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
6196 
6197   // Get the cost of the scalar memory instruction and address computation.
6198   InstructionCost Cost =
6199       VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
6200 
6201   // Don't pass *I here, since it is scalar but will actually be part of a
6202   // vectorized loop where the user of it is a vectorized instruction.
6203   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6204   const Align Alignment = getLoadStoreAlignment(I);
6205   Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(),
6206                                                       ValTy->getScalarType(),
6207                                                       Alignment, AS, CostKind);
6208 
6209   // Get the overhead of the extractelement and insertelement instructions
6210   // we might create due to scalarization.
6211   Cost += getScalarizationOverhead(I, VF, CostKind);
6212 
6213   // If we have a predicated load/store, it will need extra i1 extracts and
6214   // conditional branches, but may not be executed for each vector lane. Scale
6215   // the cost by the probability of executing the predicated block.
6216   if (isPredicatedInst(I)) {
6217     Cost /= getReciprocalPredBlockProb();
6218 
6219     // Add the cost of an i1 extract and a branch
6220     auto *Vec_i1Ty =
6221         VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
6222     Cost += TTI.getScalarizationOverhead(
6223         Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
6224         /*Insert=*/false, /*Extract=*/true, CostKind);
6225     Cost += TTI.getCFInstrCost(Instruction::Br, CostKind);
6226 
6227     if (useEmulatedMaskMemRefHack(I, VF))
6228       // Artificially setting to a high enough value to practically disable
6229       // vectorization with such operations.
6230       Cost = 3000000;
6231   }
6232 
6233   return Cost;
6234 }
6235 
6236 InstructionCost
6237 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
6238                                                     ElementCount VF) {
6239   Type *ValTy = getLoadStoreType(I);
6240   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6241   Value *Ptr = getLoadStorePointerOperand(I);
6242   unsigned AS = getLoadStoreAddressSpace(I);
6243   int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
6244   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6245 
6246   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6247          "Stride should be 1 or -1 for consecutive memory access");
6248   const Align Alignment = getLoadStoreAlignment(I);
6249   InstructionCost Cost = 0;
6250   if (Legal->isMaskRequired(I)) {
6251     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6252                                       CostKind);
6253   } else {
6254     TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
6255     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6256                                 CostKind, OpInfo, I);
6257   }
6258 
6259   bool Reverse = ConsecutiveStride < 0;
6260   if (Reverse)
6261     Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy,
6262                                std::nullopt, CostKind, 0);
6263   return Cost;
6264 }
6265 
6266 InstructionCost
6267 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
6268                                                 ElementCount VF) {
6269   assert(Legal->isUniformMemOp(*I, VF));
6270 
6271   Type *ValTy = getLoadStoreType(I);
6272   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6273   const Align Alignment = getLoadStoreAlignment(I);
6274   unsigned AS = getLoadStoreAddressSpace(I);
6275   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6276   if (isa<LoadInst>(I)) {
6277     return TTI.getAddressComputationCost(ValTy) +
6278            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
6279                                CostKind) +
6280            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
6281   }
6282   StoreInst *SI = cast<StoreInst>(I);
6283 
6284   bool isLoopInvariantStoreValue = Legal->isInvariant(SI->getValueOperand());
6285   return TTI.getAddressComputationCost(ValTy) +
6286          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
6287                              CostKind) +
6288          (isLoopInvariantStoreValue
6289               ? 0
6290               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
6291                                        CostKind, VF.getKnownMinValue() - 1));
6292 }
6293 
6294 InstructionCost
6295 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
6296                                                  ElementCount VF) {
6297   Type *ValTy = getLoadStoreType(I);
6298   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6299   const Align Alignment = getLoadStoreAlignment(I);
6300   const Value *Ptr = getLoadStorePointerOperand(I);
6301 
6302   return TTI.getAddressComputationCost(VectorTy) +
6303          TTI.getGatherScatterOpCost(
6304              I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
6305              TargetTransformInfo::TCK_RecipThroughput, I);
6306 }
6307 
6308 InstructionCost
6309 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
6310                                                    ElementCount VF) {
6311   Type *ValTy = getLoadStoreType(I);
6312   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6313   unsigned AS = getLoadStoreAddressSpace(I);
6314   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6315 
6316   auto Group = getInterleavedAccessGroup(I);
6317   assert(Group && "Fail to get an interleaved access group.");
6318 
6319   unsigned InterleaveFactor = Group->getFactor();
6320   auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
6321 
6322   // Holds the indices of existing members in the interleaved group.
6323   SmallVector<unsigned, 4> Indices;
6324   for (unsigned IF = 0; IF < InterleaveFactor; IF++)
6325     if (Group->getMember(IF))
6326       Indices.push_back(IF);
6327 
6328   // Calculate the cost of the whole interleaved group.
6329   bool UseMaskForGaps =
6330       (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
6331       (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
6332   InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
6333       I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
6334       AS, CostKind, Legal->isMaskRequired(I), UseMaskForGaps);
6335 
6336   if (Group->isReverse()) {
6337     // TODO: Add support for reversed masked interleaved access.
6338     assert(!Legal->isMaskRequired(I) &&
6339            "Reverse masked interleaved access not supported.");
6340     Cost += Group->getNumMembers() *
6341             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy,
6342                                std::nullopt, CostKind, 0);
6343   }
6344   return Cost;
6345 }
6346 
6347 std::optional<InstructionCost>
6348 LoopVectorizationCostModel::getReductionPatternCost(
6349     Instruction *I, ElementCount VF, Type *Ty,
6350     TTI::TargetCostKind CostKind) const {
6351   using namespace llvm::PatternMatch;
6352   // Early exit for no inloop reductions
6353   if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Ty))
6354     return std::nullopt;
6355   auto *VectorTy = cast<VectorType>(Ty);
6356 
6357   // We are looking for a pattern of, and finding the minimal acceptable cost:
6358   //  reduce(mul(ext(A), ext(B))) or
6359   //  reduce(mul(A, B)) or
6360   //  reduce(ext(A)) or
6361   //  reduce(A).
6362   // The basic idea is that we walk down the tree to do that, finding the root
6363   // reduction instruction in InLoopReductionImmediateChains. From there we find
6364   // the pattern of mul/ext and test the cost of the entire pattern vs the cost
6365   // of the components. If the reduction cost is lower then we return it for the
6366   // reduction instruction and 0 for the other instructions in the pattern. If
6367   // it is not we return an invalid cost specifying the orignal cost method
6368   // should be used.
6369   Instruction *RetI = I;
6370   if (match(RetI, m_ZExtOrSExt(m_Value()))) {
6371     if (!RetI->hasOneUser())
6372       return std::nullopt;
6373     RetI = RetI->user_back();
6374   }
6375 
6376   if (match(RetI, m_OneUse(m_Mul(m_Value(), m_Value()))) &&
6377       RetI->user_back()->getOpcode() == Instruction::Add) {
6378     RetI = RetI->user_back();
6379   }
6380 
6381   // Test if the found instruction is a reduction, and if not return an invalid
6382   // cost specifying the parent to use the original cost modelling.
6383   if (!InLoopReductionImmediateChains.count(RetI))
6384     return std::nullopt;
6385 
6386   // Find the reduction this chain is a part of and calculate the basic cost of
6387   // the reduction on its own.
6388   Instruction *LastChain = InLoopReductionImmediateChains.at(RetI);
6389   Instruction *ReductionPhi = LastChain;
6390   while (!isa<PHINode>(ReductionPhi))
6391     ReductionPhi = InLoopReductionImmediateChains.at(ReductionPhi);
6392 
6393   const RecurrenceDescriptor &RdxDesc =
6394       Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second;
6395 
6396   InstructionCost BaseCost = TTI.getArithmeticReductionCost(
6397       RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
6398 
6399   // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
6400   // normal fmul instruction to the cost of the fadd reduction.
6401   if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd)
6402     BaseCost +=
6403         TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);
6404 
6405   // If we're using ordered reductions then we can just return the base cost
6406   // here, since getArithmeticReductionCost calculates the full ordered
6407   // reduction cost when FP reassociation is not allowed.
6408   if (useOrderedReductions(RdxDesc))
6409     return BaseCost;
6410 
6411   // Get the operand that was not the reduction chain and match it to one of the
6412   // patterns, returning the better cost if it is found.
6413   Instruction *RedOp = RetI->getOperand(1) == LastChain
6414                            ? dyn_cast<Instruction>(RetI->getOperand(0))
6415                            : dyn_cast<Instruction>(RetI->getOperand(1));
6416 
6417   VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
6418 
6419   Instruction *Op0, *Op1;
6420   if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
6421       match(RedOp,
6422             m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
6423       match(Op0, m_ZExtOrSExt(m_Value())) &&
6424       Op0->getOpcode() == Op1->getOpcode() &&
6425       Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
6426       !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) &&
6427       (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
6428 
6429     // Matched reduce.add(ext(mul(ext(A), ext(B)))
6430     // Note that the extend opcodes need to all match, or if A==B they will have
6431     // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
6432     // which is equally fine.
6433     bool IsUnsigned = isa<ZExtInst>(Op0);
6434     auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
6435     auto *MulType = VectorType::get(Op0->getType(), VectorTy);
6436 
6437     InstructionCost ExtCost =
6438         TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
6439                              TTI::CastContextHint::None, CostKind, Op0);
6440     InstructionCost MulCost =
6441         TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
6442     InstructionCost Ext2Cost =
6443         TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType,
6444                              TTI::CastContextHint::None, CostKind, RedOp);
6445 
6446     InstructionCost RedCost = TTI.getMulAccReductionCost(
6447         IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
6448 
6449     if (RedCost.isValid() &&
6450         RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
6451       return I == RetI ? RedCost : 0;
6452   } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
6453              !TheLoop->isLoopInvariant(RedOp)) {
6454     // Matched reduce(ext(A))
6455     bool IsUnsigned = isa<ZExtInst>(RedOp);
6456     auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
6457     InstructionCost RedCost = TTI.getExtendedReductionCost(
6458         RdxDesc.getOpcode(), IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
6459         RdxDesc.getFastMathFlags(), CostKind);
6460 
6461     InstructionCost ExtCost =
6462         TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
6463                              TTI::CastContextHint::None, CostKind, RedOp);
6464     if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
6465       return I == RetI ? RedCost : 0;
6466   } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
6467              match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
6468     if (match(Op0, m_ZExtOrSExt(m_Value())) &&
6469         Op0->getOpcode() == Op1->getOpcode() &&
6470         !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) {
6471       bool IsUnsigned = isa<ZExtInst>(Op0);
6472       Type *Op0Ty = Op0->getOperand(0)->getType();
6473       Type *Op1Ty = Op1->getOperand(0)->getType();
6474       Type *LargestOpTy =
6475           Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
6476                                                                     : Op0Ty;
6477       auto *ExtType = VectorType::get(LargestOpTy, VectorTy);
6478 
6479       // Matched reduce.add(mul(ext(A), ext(B))), where the two ext may be of
6480       // different sizes. We take the largest type as the ext to reduce, and add
6481       // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))).
6482       InstructionCost ExtCost0 = TTI.getCastInstrCost(
6483           Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy),
6484           TTI::CastContextHint::None, CostKind, Op0);
6485       InstructionCost ExtCost1 = TTI.getCastInstrCost(
6486           Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy),
6487           TTI::CastContextHint::None, CostKind, Op1);
6488       InstructionCost MulCost =
6489           TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6490 
6491       InstructionCost RedCost = TTI.getMulAccReductionCost(
6492           IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
6493       InstructionCost ExtraExtCost = 0;
6494       if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
6495         Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
6496         ExtraExtCost = TTI.getCastInstrCost(
6497             ExtraExtOp->getOpcode(), ExtType,
6498             VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy),
6499             TTI::CastContextHint::None, CostKind, ExtraExtOp);
6500       }
6501 
6502       if (RedCost.isValid() &&
6503           (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
6504         return I == RetI ? RedCost : 0;
6505     } else if (!match(I, m_ZExtOrSExt(m_Value()))) {
6506       // Matched reduce.add(mul())
6507       InstructionCost MulCost =
6508           TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6509 
6510       InstructionCost RedCost = TTI.getMulAccReductionCost(
6511           true, RdxDesc.getRecurrenceType(), VectorTy, CostKind);
6512 
6513       if (RedCost.isValid() && RedCost < MulCost + BaseCost)
6514         return I == RetI ? RedCost : 0;
6515     }
6516   }
6517 
6518   return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt;
6519 }
6520 
6521 InstructionCost
6522 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
6523                                                      ElementCount VF) {
6524   // Calculate scalar cost only. Vectorization cost should be ready at this
6525   // moment.
6526   if (VF.isScalar()) {
6527     Type *ValTy = getLoadStoreType(I);
6528     const Align Alignment = getLoadStoreAlignment(I);
6529     unsigned AS = getLoadStoreAddressSpace(I);
6530 
6531     TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
6532     return TTI.getAddressComputationCost(ValTy) +
6533            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
6534                                TTI::TCK_RecipThroughput, OpInfo, I);
6535   }
6536   return getWideningCost(I, VF);
6537 }
6538 
6539 LoopVectorizationCostModel::VectorizationCostTy
6540 LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6541                                                ElementCount VF) {
6542   // If we know that this instruction will remain uniform, check the cost of
6543   // the scalar version.
6544   if (isUniformAfterVectorization(I, VF))
6545     VF = ElementCount::getFixed(1);
6546 
6547   if (VF.isVector() && isProfitableToScalarize(I, VF))
6548     return VectorizationCostTy(InstsToScalarize[VF][I], false);
6549 
6550   // Forced scalars do not have any scalarization overhead.
6551   auto ForcedScalar = ForcedScalars.find(VF);
6552   if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
6553     auto InstSet = ForcedScalar->second;
6554     if (InstSet.count(I))
6555       return VectorizationCostTy(
6556           (getInstructionCost(I, ElementCount::getFixed(1)).first *
6557            VF.getKnownMinValue()),
6558           false);
6559   }
6560 
6561   Type *VectorTy;
6562   InstructionCost C = getInstructionCost(I, VF, VectorTy);
6563 
6564   bool TypeNotScalarized = false;
6565   if (VF.isVector() && VectorTy->isVectorTy()) {
6566     if (unsigned NumParts = TTI.getNumberOfParts(VectorTy)) {
6567       if (VF.isScalable())
6568         // <vscale x 1 x iN> is assumed to be profitable over iN because
6569         // scalable registers are a distinct register class from scalar ones.
6570         // If we ever find a target which wants to lower scalable vectors
6571         // back to scalars, we'll need to update this code to explicitly
6572         // ask TTI about the register class uses for each part.
6573         TypeNotScalarized = NumParts <= VF.getKnownMinValue();
6574       else
6575         TypeNotScalarized = NumParts < VF.getKnownMinValue();
6576     } else
6577       C = InstructionCost::getInvalid();
6578   }
6579   return VectorizationCostTy(C, TypeNotScalarized);
6580 }
6581 
6582 InstructionCost LoopVectorizationCostModel::getScalarizationOverhead(
6583     Instruction *I, ElementCount VF, TTI::TargetCostKind CostKind) const {
6584 
6585   // There is no mechanism yet to create a scalable scalarization loop,
6586   // so this is currently Invalid.
6587   if (VF.isScalable())
6588     return InstructionCost::getInvalid();
6589 
6590   if (VF.isScalar())
6591     return 0;
6592 
6593   InstructionCost Cost = 0;
6594   Type *RetTy = ToVectorTy(I->getType(), VF);
6595   if (!RetTy->isVoidTy() &&
6596       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
6597     Cost += TTI.getScalarizationOverhead(
6598         cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()),
6599         /*Insert*/ true,
6600         /*Extract*/ false, CostKind);
6601 
6602   // Some targets keep addresses scalar.
6603   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
6604     return Cost;
6605 
6606   // Some targets support efficient element stores.
6607   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
6608     return Cost;
6609 
6610   // Collect operands to consider.
6611   CallInst *CI = dyn_cast<CallInst>(I);
6612   Instruction::op_range Ops = CI ? CI->args() : I->operands();
6613 
6614   // Skip operands that do not require extraction/scalarization and do not incur
6615   // any overhead.
6616   SmallVector<Type *> Tys;
6617   for (auto *V : filterExtractingOperands(Ops, VF))
6618     Tys.push_back(MaybeVectorizeType(V->getType(), VF));
6619   return Cost + TTI.getOperandsScalarizationOverhead(
6620                     filterExtractingOperands(Ops, VF), Tys, CostKind);
6621 }
6622 
6623 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
6624   if (VF.isScalar())
6625     return;
6626   NumPredStores = 0;
6627   for (BasicBlock *BB : TheLoop->blocks()) {
6628     // For each instruction in the old loop.
6629     for (Instruction &I : *BB) {
6630       Value *Ptr =  getLoadStorePointerOperand(&I);
6631       if (!Ptr)
6632         continue;
6633 
6634       // TODO: We should generate better code and update the cost model for
6635       // predicated uniform stores. Today they are treated as any other
6636       // predicated store (see added test cases in
6637       // invariant-store-vectorization.ll).
6638       if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF))
6639         NumPredStores++;
6640 
6641       if (Legal->isUniformMemOp(I, VF)) {
6642         auto isLegalToScalarize = [&]() {
6643           if (!VF.isScalable())
6644             // Scalarization of fixed length vectors "just works".
6645             return true;
6646 
6647           // We have dedicated lowering for unpredicated uniform loads and
6648           // stores.  Note that even with tail folding we know that at least
6649           // one lane is active (i.e. generalized predication is not possible
6650           // here), and the logic below depends on this fact.
6651           if (!foldTailByMasking())
6652             return true;
6653 
6654           // For scalable vectors, a uniform memop load is always
6655           // uniform-by-parts  and we know how to scalarize that.
6656           if (isa<LoadInst>(I))
6657             return true;
6658 
6659           // A uniform store isn't neccessarily uniform-by-part
6660           // and we can't assume scalarization.
6661           auto &SI = cast<StoreInst>(I);
6662           return TheLoop->isLoopInvariant(SI.getValueOperand());
6663         };
6664 
6665         const InstructionCost GatherScatterCost =
6666           isLegalGatherOrScatter(&I, VF) ?
6667           getGatherScatterCost(&I, VF) : InstructionCost::getInvalid();
6668 
6669         // Load: Scalar load + broadcast
6670         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6671         // FIXME: This cost is a significant under-estimate for tail folded
6672         // memory ops.
6673         const InstructionCost ScalarizationCost = isLegalToScalarize() ?
6674           getUniformMemOpCost(&I, VF) : InstructionCost::getInvalid();
6675 
6676         // Choose better solution for the current VF,  Note that Invalid
6677         // costs compare as maximumal large.  If both are invalid, we get
6678         // scalable invalid which signals a failure and a vectorization abort.
6679         if (GatherScatterCost < ScalarizationCost)
6680           setWideningDecision(&I, VF, CM_GatherScatter, GatherScatterCost);
6681         else
6682           setWideningDecision(&I, VF, CM_Scalarize, ScalarizationCost);
6683         continue;
6684       }
6685 
6686       // We assume that widening is the best solution when possible.
6687       if (memoryInstructionCanBeWidened(&I, VF)) {
6688         InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
6689         int ConsecutiveStride = Legal->isConsecutivePtr(
6690             getLoadStoreType(&I), getLoadStorePointerOperand(&I));
6691         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6692                "Expected consecutive stride.");
6693         InstWidening Decision =
6694             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6695         setWideningDecision(&I, VF, Decision, Cost);
6696         continue;
6697       }
6698 
6699       // Choose between Interleaving, Gather/Scatter or Scalarization.
6700       InstructionCost InterleaveCost = InstructionCost::getInvalid();
6701       unsigned NumAccesses = 1;
6702       if (isAccessInterleaved(&I)) {
6703         auto Group = getInterleavedAccessGroup(&I);
6704         assert(Group && "Fail to get an interleaved access group.");
6705 
6706         // Make one decision for the whole group.
6707         if (getWideningDecision(&I, VF) != CM_Unknown)
6708           continue;
6709 
6710         NumAccesses = Group->getNumMembers();
6711         if (interleavedAccessCanBeWidened(&I, VF))
6712           InterleaveCost = getInterleaveGroupCost(&I, VF);
6713       }
6714 
6715       InstructionCost GatherScatterCost =
6716           isLegalGatherOrScatter(&I, VF)
6717               ? getGatherScatterCost(&I, VF) * NumAccesses
6718               : InstructionCost::getInvalid();
6719 
6720       InstructionCost ScalarizationCost =
6721           getMemInstScalarizationCost(&I, VF) * NumAccesses;
6722 
6723       // Choose better solution for the current VF,
6724       // write down this decision and use it during vectorization.
6725       InstructionCost Cost;
6726       InstWidening Decision;
6727       if (InterleaveCost <= GatherScatterCost &&
6728           InterleaveCost < ScalarizationCost) {
6729         Decision = CM_Interleave;
6730         Cost = InterleaveCost;
6731       } else if (GatherScatterCost < ScalarizationCost) {
6732         Decision = CM_GatherScatter;
6733         Cost = GatherScatterCost;
6734       } else {
6735         Decision = CM_Scalarize;
6736         Cost = ScalarizationCost;
6737       }
6738       // If the instructions belongs to an interleave group, the whole group
6739       // receives the same decision. The whole group receives the cost, but
6740       // the cost will actually be assigned to one instruction.
6741       if (auto Group = getInterleavedAccessGroup(&I))
6742         setWideningDecision(Group, VF, Decision, Cost);
6743       else
6744         setWideningDecision(&I, VF, Decision, Cost);
6745     }
6746   }
6747 
6748   // Make sure that any load of address and any other address computation
6749   // remains scalar unless there is gather/scatter support. This avoids
6750   // inevitable extracts into address registers, and also has the benefit of
6751   // activating LSR more, since that pass can't optimize vectorized
6752   // addresses.
6753   if (TTI.prefersVectorizedAddressing())
6754     return;
6755 
6756   // Start with all scalar pointer uses.
6757   SmallPtrSet<Instruction *, 8> AddrDefs;
6758   for (BasicBlock *BB : TheLoop->blocks())
6759     for (Instruction &I : *BB) {
6760       Instruction *PtrDef =
6761         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6762       if (PtrDef && TheLoop->contains(PtrDef) &&
6763           getWideningDecision(&I, VF) != CM_GatherScatter)
6764         AddrDefs.insert(PtrDef);
6765     }
6766 
6767   // Add all instructions used to generate the addresses.
6768   SmallVector<Instruction *, 4> Worklist;
6769   append_range(Worklist, AddrDefs);
6770   while (!Worklist.empty()) {
6771     Instruction *I = Worklist.pop_back_val();
6772     for (auto &Op : I->operands())
6773       if (auto *InstOp = dyn_cast<Instruction>(Op))
6774         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6775             AddrDefs.insert(InstOp).second)
6776           Worklist.push_back(InstOp);
6777   }
6778 
6779   for (auto *I : AddrDefs) {
6780     if (isa<LoadInst>(I)) {
6781       // Setting the desired widening decision should ideally be handled in
6782       // by cost functions, but since this involves the task of finding out
6783       // if the loaded register is involved in an address computation, it is
6784       // instead changed here when we know this is the case.
6785       InstWidening Decision = getWideningDecision(I, VF);
6786       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6787         // Scalarize a widened load of address.
6788         setWideningDecision(
6789             I, VF, CM_Scalarize,
6790             (VF.getKnownMinValue() *
6791              getMemoryInstructionCost(I, ElementCount::getFixed(1))));
6792       else if (auto Group = getInterleavedAccessGroup(I)) {
6793         // Scalarize an interleave group of address loads.
6794         for (unsigned I = 0; I < Group->getFactor(); ++I) {
6795           if (Instruction *Member = Group->getMember(I))
6796             setWideningDecision(
6797                 Member, VF, CM_Scalarize,
6798                 (VF.getKnownMinValue() *
6799                  getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
6800         }
6801       }
6802     } else
6803       // Make sure I gets scalarized and a cost estimate without
6804       // scalarization overhead.
6805       ForcedScalars[VF].insert(I);
6806   }
6807 }
6808 
6809 void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
6810   assert(!VF.isScalar() &&
6811          "Trying to set a vectorization decision for a scalar VF");
6812 
6813   for (BasicBlock *BB : TheLoop->blocks()) {
6814     // For each instruction in the old loop.
6815     for (Instruction &I : *BB) {
6816       CallInst *CI = dyn_cast<CallInst>(&I);
6817 
6818       if (!CI)
6819         continue;
6820 
6821       InstructionCost ScalarCost = InstructionCost::getInvalid();
6822       InstructionCost VectorCost = InstructionCost::getInvalid();
6823       InstructionCost IntrinsicCost = InstructionCost::getInvalid();
6824       TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6825 
6826       Function *ScalarFunc = CI->getCalledFunction();
6827       Type *ScalarRetTy = CI->getType();
6828       SmallVector<Type *, 4> Tys, ScalarTys;
6829       bool MaskRequired = Legal->isMaskRequired(CI);
6830       for (auto &ArgOp : CI->args())
6831         ScalarTys.push_back(ArgOp->getType());
6832 
6833       // Compute corresponding vector type for return value and arguments.
6834       Type *RetTy = ToVectorTy(ScalarRetTy, VF);
6835       for (Type *ScalarTy : ScalarTys)
6836         Tys.push_back(ToVectorTy(ScalarTy, VF));
6837 
6838       // An in-loop reduction using an fmuladd intrinsic is a special case;
6839       // we don't want the normal cost for that intrinsic.
6840       if (RecurrenceDescriptor::isFMulAddIntrinsic(CI))
6841         if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind)) {
6842           setCallWideningDecision(CI, VF, CM_IntrinsicCall, nullptr,
6843                                   getVectorIntrinsicIDForCall(CI, TLI),
6844                                   std::nullopt, *RedCost);
6845           continue;
6846         }
6847 
6848       // Estimate cost of scalarized vector call. The source operands are
6849       // assumed to be vectors, so we need to extract individual elements from
6850       // there, execute VF scalar calls, and then gather the result into the
6851       // vector return value.
6852       InstructionCost ScalarCallCost =
6853           TTI.getCallInstrCost(ScalarFunc, ScalarRetTy, ScalarTys, CostKind);
6854 
6855       // Compute costs of unpacking argument values for the scalar calls and
6856       // packing the return values to a vector.
6857       InstructionCost ScalarizationCost =
6858           getScalarizationOverhead(CI, VF, CostKind);
6859 
6860       ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
6861 
6862       // Find the cost of vectorizing the call, if we can find a suitable
6863       // vector variant of the function.
6864       bool UsesMask = false;
6865       VFInfo FuncInfo;
6866       Function *VecFunc = nullptr;
6867       // Search through any available variants for one we can use at this VF.
6868       for (VFInfo &Info : VFDatabase::getMappings(*CI)) {
6869         // Must match requested VF.
6870         if (Info.Shape.VF != VF)
6871           continue;
6872 
6873         // Must take a mask argument if one is required
6874         if (MaskRequired && !Info.isMasked())
6875           continue;
6876 
6877         // Check that all parameter kinds are supported
6878         bool ParamsOk = true;
6879         for (VFParameter Param : Info.Shape.Parameters) {
6880           switch (Param.ParamKind) {
6881           case VFParamKind::Vector:
6882             break;
6883           case VFParamKind::OMP_Uniform: {
6884             Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
6885             // Make sure the scalar parameter in the loop is invariant.
6886             if (!PSE.getSE()->isLoopInvariant(PSE.getSCEV(ScalarParam),
6887                                               TheLoop))
6888               ParamsOk = false;
6889             break;
6890           }
6891           case VFParamKind::OMP_Linear: {
6892             Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
6893             // Find the stride for the scalar parameter in this loop and see if
6894             // it matches the stride for the variant.
6895             // TODO: do we need to figure out the cost of an extract to get the
6896             // first lane? Or do we hope that it will be folded away?
6897             ScalarEvolution *SE = PSE.getSE();
6898             const auto *SAR =
6899                 dyn_cast<SCEVAddRecExpr>(SE->getSCEV(ScalarParam));
6900 
6901             if (!SAR || SAR->getLoop() != TheLoop) {
6902               ParamsOk = false;
6903               break;
6904             }
6905 
6906             const SCEVConstant *Step =
6907                 dyn_cast<SCEVConstant>(SAR->getStepRecurrence(*SE));
6908 
6909             if (!Step ||
6910                 Step->getAPInt().getSExtValue() != Param.LinearStepOrPos)
6911               ParamsOk = false;
6912 
6913             break;
6914           }
6915           case VFParamKind::GlobalPredicate:
6916             UsesMask = true;
6917             break;
6918           default:
6919             ParamsOk = false;
6920             break;
6921           }
6922         }
6923 
6924         if (!ParamsOk)
6925           continue;
6926 
6927         // Found a suitable candidate, stop here.
6928         VecFunc = CI->getModule()->getFunction(Info.VectorName);
6929         FuncInfo = Info;
6930         break;
6931       }
6932 
6933       // Add in the cost of synthesizing a mask if one wasn't required.
6934       InstructionCost MaskCost = 0;
6935       if (VecFunc && UsesMask && !MaskRequired)
6936         MaskCost = TTI.getShuffleCost(
6937             TargetTransformInfo::SK_Broadcast,
6938             VectorType::get(IntegerType::getInt1Ty(
6939                                 VecFunc->getFunctionType()->getContext()),
6940                             VF));
6941 
6942       if (TLI && VecFunc && !CI->isNoBuiltin())
6943         VectorCost =
6944             TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind) + MaskCost;
6945 
6946       // Find the cost of an intrinsic; some targets may have instructions that
6947       // perform the operation without needing an actual call.
6948       Intrinsic::ID IID = getVectorIntrinsicIDForCall(CI, TLI);
6949       if (IID != Intrinsic::not_intrinsic)
6950         IntrinsicCost = getVectorIntrinsicCost(CI, VF);
6951 
6952       InstructionCost Cost = ScalarCost;
6953       InstWidening Decision = CM_Scalarize;
6954 
6955       if (VectorCost <= Cost) {
6956         Cost = VectorCost;
6957         Decision = CM_VectorCall;
6958       }
6959 
6960       if (IntrinsicCost <= Cost) {
6961         Cost = IntrinsicCost;
6962         Decision = CM_IntrinsicCall;
6963       }
6964 
6965       setCallWideningDecision(CI, VF, Decision, VecFunc, IID,
6966                               FuncInfo.getParamIndexForOptionalMask(), Cost);
6967     }
6968   }
6969 }
6970 
6971 InstructionCost
6972 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
6973                                                Type *&VectorTy) {
6974   Type *RetTy = I->getType();
6975   if (canTruncateToMinimalBitwidth(I, VF))
6976     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6977   auto SE = PSE.getSE();
6978   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6979 
6980   auto hasSingleCopyAfterVectorization = [this](Instruction *I,
6981                                                 ElementCount VF) -> bool {
6982     if (VF.isScalar())
6983       return true;
6984 
6985     auto Scalarized = InstsToScalarize.find(VF);
6986     assert(Scalarized != InstsToScalarize.end() &&
6987            "VF not yet analyzed for scalarization profitability");
6988     return !Scalarized->second.count(I) &&
6989            llvm::all_of(I->users(), [&](User *U) {
6990              auto *UI = cast<Instruction>(U);
6991              return !Scalarized->second.count(UI);
6992            });
6993   };
6994   (void) hasSingleCopyAfterVectorization;
6995 
6996   if (isScalarAfterVectorization(I, VF)) {
6997     // With the exception of GEPs and PHIs, after scalarization there should
6998     // only be one copy of the instruction generated in the loop. This is
6999     // because the VF is either 1, or any instructions that need scalarizing
7000     // have already been dealt with by the time we get here. As a result,
7001     // it means we don't have to multiply the instruction cost by VF.
7002     assert(I->getOpcode() == Instruction::GetElementPtr ||
7003            I->getOpcode() == Instruction::PHI ||
7004            (I->getOpcode() == Instruction::BitCast &&
7005             I->getType()->isPointerTy()) ||
7006            hasSingleCopyAfterVectorization(I, VF));
7007     VectorTy = RetTy;
7008   } else
7009     VectorTy = ToVectorTy(RetTy, VF);
7010 
7011   // TODO: We need to estimate the cost of intrinsic calls.
7012   switch (I->getOpcode()) {
7013   case Instruction::GetElementPtr:
7014     // We mark this instruction as zero-cost because the cost of GEPs in
7015     // vectorized code depends on whether the corresponding memory instruction
7016     // is scalarized or not. Therefore, we handle GEPs with the memory
7017     // instruction cost.
7018     return 0;
7019   case Instruction::Br: {
7020     // In cases of scalarized and predicated instructions, there will be VF
7021     // predicated blocks in the vectorized loop. Each branch around these
7022     // blocks requires also an extract of its vector compare i1 element.
7023     bool ScalarPredicatedBB = false;
7024     BranchInst *BI = cast<BranchInst>(I);
7025     if (VF.isVector() && BI->isConditional() &&
7026         (PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) ||
7027          PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))))
7028       ScalarPredicatedBB = true;
7029 
7030     if (ScalarPredicatedBB) {
7031       // Not possible to scalarize scalable vector with predicated instructions.
7032       if (VF.isScalable())
7033         return InstructionCost::getInvalid();
7034       // Return cost for branches around scalarized and predicated blocks.
7035       auto *Vec_i1Ty =
7036           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
7037       return (
7038           TTI.getScalarizationOverhead(
7039               Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()),
7040               /*Insert*/ false, /*Extract*/ true, CostKind) +
7041           (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
7042     } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
7043       // The back-edge branch will remain, as will all scalar branches.
7044       return TTI.getCFInstrCost(Instruction::Br, CostKind);
7045     else
7046       // This branch will be eliminated by if-conversion.
7047       return 0;
7048     // Note: We currently assume zero cost for an unconditional branch inside
7049     // a predicated block since it will become a fall-through, although we
7050     // may decide in the future to call TTI for all branches.
7051   }
7052   case Instruction::PHI: {
7053     auto *Phi = cast<PHINode>(I);
7054 
7055     // First-order recurrences are replaced by vector shuffles inside the loop.
7056     if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) {
7057       SmallVector<int> Mask(VF.getKnownMinValue());
7058       std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
7059       return TTI.getShuffleCost(TargetTransformInfo::SK_Splice,
7060                                 cast<VectorType>(VectorTy), Mask, CostKind,
7061                                 VF.getKnownMinValue() - 1);
7062     }
7063 
7064     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
7065     // converted into select instructions. We require N - 1 selects per phi
7066     // node, where N is the number of incoming values.
7067     if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
7068       return (Phi->getNumIncomingValues() - 1) *
7069              TTI.getCmpSelInstrCost(
7070                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
7071                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
7072                  CmpInst::BAD_ICMP_PREDICATE, CostKind);
7073 
7074     return TTI.getCFInstrCost(Instruction::PHI, CostKind);
7075   }
7076   case Instruction::UDiv:
7077   case Instruction::SDiv:
7078   case Instruction::URem:
7079   case Instruction::SRem:
7080     if (VF.isVector() && isPredicatedInst(I)) {
7081       const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
7082       return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ?
7083         ScalarCost : SafeDivisorCost;
7084     }
7085     // We've proven all lanes safe to speculate, fall through.
7086     [[fallthrough]];
7087   case Instruction::Add:
7088   case Instruction::FAdd:
7089   case Instruction::Sub:
7090   case Instruction::FSub:
7091   case Instruction::Mul:
7092   case Instruction::FMul:
7093   case Instruction::FDiv:
7094   case Instruction::FRem:
7095   case Instruction::Shl:
7096   case Instruction::LShr:
7097   case Instruction::AShr:
7098   case Instruction::And:
7099   case Instruction::Or:
7100   case Instruction::Xor: {
7101     // If we're speculating on the stride being 1, the multiplication may
7102     // fold away.  We can generalize this for all operations using the notion
7103     // of neutral elements.  (TODO)
7104     if (I->getOpcode() == Instruction::Mul &&
7105         (PSE.getSCEV(I->getOperand(0))->isOne() ||
7106          PSE.getSCEV(I->getOperand(1))->isOne()))
7107       return 0;
7108 
7109     // Detect reduction patterns
7110     if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7111       return *RedCost;
7112 
7113     // Certain instructions can be cheaper to vectorize if they have a constant
7114     // second vector operand. One example of this are shifts on x86.
7115     Value *Op2 = I->getOperand(1);
7116     auto Op2Info = TTI.getOperandInfo(Op2);
7117     if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
7118         Legal->isInvariant(Op2))
7119       Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
7120 
7121     SmallVector<const Value *, 4> Operands(I->operand_values());
7122     return TTI.getArithmeticInstrCost(
7123         I->getOpcode(), VectorTy, CostKind,
7124         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
7125         Op2Info, Operands, I);
7126   }
7127   case Instruction::FNeg: {
7128     return TTI.getArithmeticInstrCost(
7129         I->getOpcode(), VectorTy, CostKind,
7130         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
7131         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
7132         I->getOperand(0), I);
7133   }
7134   case Instruction::Select: {
7135     SelectInst *SI = cast<SelectInst>(I);
7136     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
7137     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
7138 
7139     const Value *Op0, *Op1;
7140     using namespace llvm::PatternMatch;
7141     if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
7142                         match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
7143       // select x, y, false --> x & y
7144       // select x, true, y --> x | y
7145       const auto [Op1VK, Op1VP] = TTI::getOperandInfo(Op0);
7146       const auto [Op2VK, Op2VP] = TTI::getOperandInfo(Op1);
7147       assert(Op0->getType()->getScalarSizeInBits() == 1 &&
7148               Op1->getType()->getScalarSizeInBits() == 1);
7149 
7150       SmallVector<const Value *, 2> Operands{Op0, Op1};
7151       return TTI.getArithmeticInstrCost(
7152           match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy,
7153           CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, I);
7154     }
7155 
7156     Type *CondTy = SI->getCondition()->getType();
7157     if (!ScalarCond)
7158       CondTy = VectorType::get(CondTy, VF);
7159 
7160     CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
7161     if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
7162       Pred = Cmp->getPredicate();
7163     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred,
7164                                   CostKind, I);
7165   }
7166   case Instruction::ICmp:
7167   case Instruction::FCmp: {
7168     Type *ValTy = I->getOperand(0)->getType();
7169     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
7170     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
7171       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
7172     VectorTy = ToVectorTy(ValTy, VF);
7173     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
7174                                   cast<CmpInst>(I)->getPredicate(), CostKind,
7175                                   I);
7176   }
7177   case Instruction::Store:
7178   case Instruction::Load: {
7179     ElementCount Width = VF;
7180     if (Width.isVector()) {
7181       InstWidening Decision = getWideningDecision(I, Width);
7182       assert(Decision != CM_Unknown &&
7183              "CM decision should be taken at this point");
7184       if (getWideningCost(I, VF) == InstructionCost::getInvalid())
7185         return InstructionCost::getInvalid();
7186       if (Decision == CM_Scalarize)
7187         Width = ElementCount::getFixed(1);
7188     }
7189     VectorTy = ToVectorTy(getLoadStoreType(I), Width);
7190     return getMemoryInstructionCost(I, VF);
7191   }
7192   case Instruction::BitCast:
7193     if (I->getType()->isPointerTy())
7194       return 0;
7195     [[fallthrough]];
7196   case Instruction::ZExt:
7197   case Instruction::SExt:
7198   case Instruction::FPToUI:
7199   case Instruction::FPToSI:
7200   case Instruction::FPExt:
7201   case Instruction::PtrToInt:
7202   case Instruction::IntToPtr:
7203   case Instruction::SIToFP:
7204   case Instruction::UIToFP:
7205   case Instruction::Trunc:
7206   case Instruction::FPTrunc: {
7207     // Computes the CastContextHint from a Load/Store instruction.
7208     auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
7209       assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7210              "Expected a load or a store!");
7211 
7212       if (VF.isScalar() || !TheLoop->contains(I))
7213         return TTI::CastContextHint::Normal;
7214 
7215       switch (getWideningDecision(I, VF)) {
7216       case LoopVectorizationCostModel::CM_GatherScatter:
7217         return TTI::CastContextHint::GatherScatter;
7218       case LoopVectorizationCostModel::CM_Interleave:
7219         return TTI::CastContextHint::Interleave;
7220       case LoopVectorizationCostModel::CM_Scalarize:
7221       case LoopVectorizationCostModel::CM_Widen:
7222         return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
7223                                         : TTI::CastContextHint::Normal;
7224       case LoopVectorizationCostModel::CM_Widen_Reverse:
7225         return TTI::CastContextHint::Reversed;
7226       case LoopVectorizationCostModel::CM_Unknown:
7227         llvm_unreachable("Instr did not go through cost modelling?");
7228       case LoopVectorizationCostModel::CM_VectorCall:
7229       case LoopVectorizationCostModel::CM_IntrinsicCall:
7230         llvm_unreachable_internal("Instr has invalid widening decision");
7231       }
7232 
7233       llvm_unreachable("Unhandled case!");
7234     };
7235 
7236     unsigned Opcode = I->getOpcode();
7237     TTI::CastContextHint CCH = TTI::CastContextHint::None;
7238     // For Trunc, the context is the only user, which must be a StoreInst.
7239     if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
7240       if (I->hasOneUse())
7241         if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
7242           CCH = ComputeCCH(Store);
7243     }
7244     // For Z/Sext, the context is the operand, which must be a LoadInst.
7245     else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
7246              Opcode == Instruction::FPExt) {
7247       if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
7248         CCH = ComputeCCH(Load);
7249     }
7250 
7251     // We optimize the truncation of induction variables having constant
7252     // integer steps. The cost of these truncations is the same as the scalar
7253     // operation.
7254     if (isOptimizableIVTruncate(I, VF)) {
7255       auto *Trunc = cast<TruncInst>(I);
7256       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
7257                                   Trunc->getSrcTy(), CCH, CostKind, Trunc);
7258     }
7259 
7260     // Detect reduction patterns
7261     if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7262       return *RedCost;
7263 
7264     Type *SrcScalarTy = I->getOperand(0)->getType();
7265     Type *SrcVecTy =
7266         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
7267     if (canTruncateToMinimalBitwidth(I, VF)) {
7268       // This cast is going to be shrunk. This may remove the cast or it might
7269       // turn it into slightly different cast. For example, if MinBW == 16,
7270       // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
7271       //
7272       // Calculate the modified src and dest types.
7273       Type *MinVecTy = VectorTy;
7274       if (Opcode == Instruction::Trunc) {
7275         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
7276         VectorTy =
7277             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7278       } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
7279         // Leave SrcVecTy unchanged - we only shrink the destination element
7280         // type.
7281         VectorTy =
7282             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7283       }
7284     }
7285 
7286     return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
7287   }
7288   case Instruction::Call:
7289     return getVectorCallCost(cast<CallInst>(I), VF);
7290   case Instruction::ExtractValue:
7291     return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
7292   case Instruction::Alloca:
7293     // We cannot easily widen alloca to a scalable alloca, as
7294     // the result would need to be a vector of pointers.
7295     if (VF.isScalable())
7296       return InstructionCost::getInvalid();
7297     [[fallthrough]];
7298   default:
7299     // This opcode is unknown. Assume that it is the same as 'mul'.
7300     return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
7301   } // end of switch.
7302 }
7303 
7304 void LoopVectorizationCostModel::collectValuesToIgnore() {
7305   // Ignore ephemeral values.
7306   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
7307 
7308   // Find all stores to invariant variables. Since they are going to sink
7309   // outside the loop we do not need calculate cost for them.
7310   for (BasicBlock *BB : TheLoop->blocks())
7311     for (Instruction &I : *BB) {
7312       StoreInst *SI;
7313       if ((SI = dyn_cast<StoreInst>(&I)) &&
7314           Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
7315         ValuesToIgnore.insert(&I);
7316     }
7317 
7318   // Ignore type-promoting instructions we identified during reduction
7319   // detection.
7320   for (const auto &Reduction : Legal->getReductionVars()) {
7321     const RecurrenceDescriptor &RedDes = Reduction.second;
7322     const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
7323     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7324   }
7325   // Ignore type-casting instructions we identified during induction
7326   // detection.
7327   for (const auto &Induction : Legal->getInductionVars()) {
7328     const InductionDescriptor &IndDes = Induction.second;
7329     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7330     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7331   }
7332 }
7333 
7334 void LoopVectorizationCostModel::collectInLoopReductions() {
7335   for (const auto &Reduction : Legal->getReductionVars()) {
7336     PHINode *Phi = Reduction.first;
7337     const RecurrenceDescriptor &RdxDesc = Reduction.second;
7338 
7339     // We don't collect reductions that are type promoted (yet).
7340     if (RdxDesc.getRecurrenceType() != Phi->getType())
7341       continue;
7342 
7343     // If the target would prefer this reduction to happen "in-loop", then we
7344     // want to record it as such.
7345     unsigned Opcode = RdxDesc.getOpcode();
7346     if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
7347         !TTI.preferInLoopReduction(Opcode, Phi->getType(),
7348                                    TargetTransformInfo::ReductionFlags()))
7349       continue;
7350 
7351     // Check that we can correctly put the reductions into the loop, by
7352     // finding the chain of operations that leads from the phi to the loop
7353     // exit value.
7354     SmallVector<Instruction *, 4> ReductionOperations =
7355         RdxDesc.getReductionOpChain(Phi, TheLoop);
7356     bool InLoop = !ReductionOperations.empty();
7357 
7358     if (InLoop) {
7359       InLoopReductions.insert(Phi);
7360       // Add the elements to InLoopReductionImmediateChains for cost modelling.
7361       Instruction *LastChain = Phi;
7362       for (auto *I : ReductionOperations) {
7363         InLoopReductionImmediateChains[I] = LastChain;
7364         LastChain = I;
7365       }
7366     }
7367     LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
7368                       << " reduction for phi: " << *Phi << "\n");
7369   }
7370 }
7371 
7372 VPValue *VPBuilder::createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B,
7373                                DebugLoc DL, const Twine &Name) {
7374   assert(Pred >= CmpInst::FIRST_ICMP_PREDICATE &&
7375          Pred <= CmpInst::LAST_ICMP_PREDICATE && "invalid predicate");
7376   return tryInsertInstruction(
7377       new VPInstruction(Instruction::ICmp, Pred, A, B, DL, Name));
7378 }
7379 
7380 // This function will select a scalable VF if the target supports scalable
7381 // vectors and a fixed one otherwise.
7382 // TODO: we could return a pair of values that specify the max VF and
7383 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
7384 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
7385 // doesn't have a cost model that can choose which plan to execute if
7386 // more than one is generated.
7387 static ElementCount determineVPlanVF(const TargetTransformInfo &TTI,
7388                                      LoopVectorizationCostModel &CM) {
7389   unsigned WidestType;
7390   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
7391 
7392   TargetTransformInfo::RegisterKind RegKind =
7393       TTI.enableScalableVectorization()
7394           ? TargetTransformInfo::RGK_ScalableVector
7395           : TargetTransformInfo::RGK_FixedWidthVector;
7396 
7397   TypeSize RegSize = TTI.getRegisterBitWidth(RegKind);
7398   unsigned N = RegSize.getKnownMinValue() / WidestType;
7399   return ElementCount::get(N, RegSize.isScalable());
7400 }
7401 
7402 VectorizationFactor
7403 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
7404   ElementCount VF = UserVF;
7405   // Outer loop handling: They may require CFG and instruction level
7406   // transformations before even evaluating whether vectorization is profitable.
7407   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7408   // the vectorization pipeline.
7409   if (!OrigLoop->isInnermost()) {
7410     // If the user doesn't provide a vectorization factor, determine a
7411     // reasonable one.
7412     if (UserVF.isZero()) {
7413       VF = determineVPlanVF(TTI, CM);
7414       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
7415 
7416       // Make sure we have a VF > 1 for stress testing.
7417       if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
7418         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
7419                           << "overriding computed VF.\n");
7420         VF = ElementCount::getFixed(4);
7421       }
7422     } else if (UserVF.isScalable() && !TTI.supportsScalableVectors() &&
7423                !ForceTargetSupportsScalableVectors) {
7424       LLVM_DEBUG(dbgs() << "LV: Not vectorizing. Scalable VF requested, but "
7425                         << "not supported by the target.\n");
7426       reportVectorizationFailure(
7427           "Scalable vectorization requested but not supported by the target",
7428           "the scalable user-specified vectorization width for outer-loop "
7429           "vectorization cannot be used because the target does not support "
7430           "scalable vectors.",
7431           "ScalableVFUnfeasible", ORE, OrigLoop);
7432       return VectorizationFactor::Disabled();
7433     }
7434     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7435     assert(isPowerOf2_32(VF.getKnownMinValue()) &&
7436            "VF needs to be a power of two");
7437     LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
7438                       << "VF " << VF << " to build VPlans.\n");
7439     buildVPlans(VF, VF);
7440 
7441     // For VPlan build stress testing, we bail out after VPlan construction.
7442     if (VPlanBuildStressTest)
7443       return VectorizationFactor::Disabled();
7444 
7445     return {VF, 0 /*Cost*/, 0 /* ScalarCost */};
7446   }
7447 
7448   LLVM_DEBUG(
7449       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
7450                 "VPlan-native path.\n");
7451   return VectorizationFactor::Disabled();
7452 }
7453 
7454 std::optional<VectorizationFactor>
7455 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
7456   assert(OrigLoop->isInnermost() && "Inner loop expected.");
7457   CM.collectValuesToIgnore();
7458   CM.collectElementTypesForWidening();
7459 
7460   FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
7461   if (!MaxFactors) // Cases that should not to be vectorized nor interleaved.
7462     return std::nullopt;
7463 
7464   // Invalidate interleave groups if all blocks of loop will be predicated.
7465   if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
7466       !useMaskedInterleavedAccesses(TTI)) {
7467     LLVM_DEBUG(
7468         dbgs()
7469         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
7470            "which requires masked-interleaved support.\n");
7471     if (CM.InterleaveInfo.invalidateGroups())
7472       // Invalidating interleave groups also requires invalidating all decisions
7473       // based on them, which includes widening decisions and uniform and scalar
7474       // values.
7475       CM.invalidateCostModelingDecisions();
7476   }
7477 
7478   ElementCount MaxUserVF =
7479       UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
7480   bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF);
7481   if (!UserVF.isZero() && UserVFIsLegal) {
7482     assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
7483            "VF needs to be a power of two");
7484     // Collect the instructions (and their associated costs) that will be more
7485     // profitable to scalarize.
7486     CM.collectInLoopReductions();
7487     if (CM.selectUserVectorizationFactor(UserVF)) {
7488       LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
7489       buildVPlansWithVPRecipes(UserVF, UserVF);
7490       if (!hasPlanWithVF(UserVF)) {
7491         LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << UserVF
7492                           << ".\n");
7493         return std::nullopt;
7494       }
7495 
7496       LLVM_DEBUG(printPlans(dbgs()));
7497       return {{UserVF, 0, 0}};
7498     } else
7499       reportVectorizationInfo("UserVF ignored because of invalid costs.",
7500                               "InvalidCost", ORE, OrigLoop);
7501   }
7502 
7503   // Populate the set of Vectorization Factor Candidates.
7504   ElementCountSet VFCandidates;
7505   for (auto VF = ElementCount::getFixed(1);
7506        ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
7507     VFCandidates.insert(VF);
7508   for (auto VF = ElementCount::getScalable(1);
7509        ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
7510     VFCandidates.insert(VF);
7511 
7512   CM.collectInLoopReductions();
7513   for (const auto &VF : VFCandidates) {
7514     // Collect Uniform and Scalar instructions after vectorization with VF.
7515     CM.collectUniformsAndScalars(VF);
7516 
7517     // Collect the instructions (and their associated costs) that will be more
7518     // profitable to scalarize.
7519     if (VF.isVector())
7520       CM.collectInstsToScalarize(VF);
7521   }
7522 
7523   buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
7524   buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
7525 
7526   LLVM_DEBUG(printPlans(dbgs()));
7527   if (!MaxFactors.hasVector())
7528     return VectorizationFactor::Disabled();
7529 
7530   // Select the optimal vectorization factor.
7531   VectorizationFactor VF = selectVectorizationFactor(VFCandidates);
7532   assert((VF.Width.isScalar() || VF.ScalarCost > 0) && "when vectorizing, the scalar cost must be non-zero.");
7533   if (!hasPlanWithVF(VF.Width)) {
7534     LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << VF.Width
7535                       << ".\n");
7536     return std::nullopt;
7537   }
7538   return VF;
7539 }
7540 
7541 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
7542   assert(count_if(VPlans,
7543                   [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
7544              1 &&
7545          "Best VF has not a single VPlan.");
7546 
7547   for (const VPlanPtr &Plan : VPlans) {
7548     if (Plan->hasVF(VF))
7549       return *Plan.get();
7550   }
7551   llvm_unreachable("No plan found!");
7552 }
7553 
7554 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
7555   SmallVector<Metadata *, 4> MDs;
7556   // Reserve first location for self reference to the LoopID metadata node.
7557   MDs.push_back(nullptr);
7558   bool IsUnrollMetadata = false;
7559   MDNode *LoopID = L->getLoopID();
7560   if (LoopID) {
7561     // First find existing loop unrolling disable metadata.
7562     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
7563       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
7564       if (MD) {
7565         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
7566         IsUnrollMetadata =
7567             S && S->getString().starts_with("llvm.loop.unroll.disable");
7568       }
7569       MDs.push_back(LoopID->getOperand(i));
7570     }
7571   }
7572 
7573   if (!IsUnrollMetadata) {
7574     // Add runtime unroll disable metadata.
7575     LLVMContext &Context = L->getHeader()->getContext();
7576     SmallVector<Metadata *, 1> DisableOperands;
7577     DisableOperands.push_back(
7578         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7579     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7580     MDs.push_back(DisableNode);
7581     MDNode *NewLoopID = MDNode::get(Context, MDs);
7582     // Set operand 0 to refer to the loop id itself.
7583     NewLoopID->replaceOperandWith(0, NewLoopID);
7584     L->setLoopID(NewLoopID);
7585   }
7586 }
7587 
7588 SCEV2ValueTy LoopVectorizationPlanner::executePlan(
7589     ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan,
7590     InnerLoopVectorizer &ILV, DominatorTree *DT, bool IsEpilogueVectorization,
7591     const DenseMap<const SCEV *, Value *> *ExpandedSCEVs) {
7592   assert(BestVPlan.hasVF(BestVF) &&
7593          "Trying to execute plan with unsupported VF");
7594   assert(BestVPlan.hasUF(BestUF) &&
7595          "Trying to execute plan with unsupported UF");
7596   assert(
7597       (IsEpilogueVectorization || !ExpandedSCEVs) &&
7598       "expanded SCEVs to reuse can only be used during epilogue vectorization");
7599 
7600   LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF
7601                     << '\n');
7602 
7603   if (!IsEpilogueVectorization)
7604     VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
7605 
7606   // Perform the actual loop transformation.
7607   VPTransformState State(BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan,
7608                          OrigLoop->getHeader()->getContext());
7609 
7610   // 0. Generate SCEV-dependent code into the preheader, including TripCount,
7611   // before making any changes to the CFG.
7612   if (!BestVPlan.getPreheader()->empty()) {
7613     State.CFG.PrevBB = OrigLoop->getLoopPreheader();
7614     State.Builder.SetInsertPoint(OrigLoop->getLoopPreheader()->getTerminator());
7615     BestVPlan.getPreheader()->execute(&State);
7616   }
7617   if (!ILV.getTripCount())
7618     ILV.setTripCount(State.get(BestVPlan.getTripCount(), {0, 0}));
7619   else
7620     assert(IsEpilogueVectorization && "should only re-use the existing trip "
7621                                       "count during epilogue vectorization");
7622 
7623   // 1. Set up the skeleton for vectorization, including vector pre-header and
7624   // middle block. The vector loop is created during VPlan execution.
7625   Value *CanonicalIVStartValue;
7626   std::tie(State.CFG.PrevBB, CanonicalIVStartValue) =
7627       ILV.createVectorizedLoopSkeleton(ExpandedSCEVs ? *ExpandedSCEVs
7628                                                      : State.ExpandedSCEVs);
7629 
7630   // Only use noalias metadata when using memory checks guaranteeing no overlap
7631   // across all iterations.
7632   const LoopAccessInfo *LAI = ILV.Legal->getLAI();
7633   std::unique_ptr<LoopVersioning> LVer = nullptr;
7634   if (LAI && !LAI->getRuntimePointerChecking()->getChecks().empty() &&
7635       !LAI->getRuntimePointerChecking()->getDiffChecks()) {
7636 
7637     //  We currently don't use LoopVersioning for the actual loop cloning but we
7638     //  still use it to add the noalias metadata.
7639     //  TODO: Find a better way to re-use LoopVersioning functionality to add
7640     //        metadata.
7641     LVer = std::make_unique<LoopVersioning>(
7642         *LAI, LAI->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, DT,
7643         PSE.getSE());
7644     State.LVer = &*LVer;
7645     State.LVer->prepareNoAliasMetadata();
7646   }
7647 
7648   ILV.collectPoisonGeneratingRecipes(State);
7649 
7650   ILV.printDebugTracesAtStart();
7651 
7652   //===------------------------------------------------===//
7653   //
7654   // Notice: any optimization or new instruction that go
7655   // into the code below should also be implemented in
7656   // the cost-model.
7657   //
7658   //===------------------------------------------------===//
7659 
7660   // 2. Copy and widen instructions from the old loop into the new loop.
7661   BestVPlan.prepareToExecute(ILV.getTripCount(),
7662                              ILV.getOrCreateVectorTripCount(nullptr),
7663                              CanonicalIVStartValue, State);
7664 
7665   BestVPlan.execute(&State);
7666 
7667   // Keep all loop hints from the original loop on the vector loop (we'll
7668   // replace the vectorizer-specific hints below).
7669   MDNode *OrigLoopID = OrigLoop->getLoopID();
7670 
7671   std::optional<MDNode *> VectorizedLoopID =
7672       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7673                                       LLVMLoopVectorizeFollowupVectorized});
7674 
7675   VPBasicBlock *HeaderVPBB =
7676       BestVPlan.getVectorLoopRegion()->getEntryBasicBlock();
7677   Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]);
7678   if (VectorizedLoopID)
7679     L->setLoopID(*VectorizedLoopID);
7680   else {
7681     // Keep all loop hints from the original loop on the vector loop (we'll
7682     // replace the vectorizer-specific hints below).
7683     if (MDNode *LID = OrigLoop->getLoopID())
7684       L->setLoopID(LID);
7685 
7686     LoopVectorizeHints Hints(L, true, *ORE);
7687     Hints.setAlreadyVectorized();
7688   }
7689   TargetTransformInfo::UnrollingPreferences UP;
7690   TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE);
7691   if (!UP.UnrollVectorizedLoop || CanonicalIVStartValue)
7692     AddRuntimeUnrollDisableMetaData(L);
7693 
7694   // 3. Fix the vectorized code: take care of header phi's, live-outs,
7695   //    predication, updating analyses.
7696   ILV.fixVectorizedLoop(State, BestVPlan);
7697 
7698   ILV.printDebugTracesAtEnd();
7699 
7700   return State.ExpandedSCEVs;
7701 }
7702 
7703 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
7704 void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
7705   for (const auto &Plan : VPlans)
7706     if (PrintVPlansInDotFormat)
7707       Plan->printDOT(O);
7708     else
7709       Plan->print(O);
7710 }
7711 #endif
7712 
7713 //===--------------------------------------------------------------------===//
7714 // EpilogueVectorizerMainLoop
7715 //===--------------------------------------------------------------------===//
7716 
7717 /// This function is partially responsible for generating the control flow
7718 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7719 std::pair<BasicBlock *, Value *>
7720 EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton(
7721     const SCEV2ValueTy &ExpandedSCEVs) {
7722   createVectorLoopSkeleton("");
7723 
7724   // Generate the code to check the minimum iteration count of the vector
7725   // epilogue (see below).
7726   EPI.EpilogueIterationCountCheck =
7727       emitIterationCountCheck(LoopScalarPreHeader, true);
7728   EPI.EpilogueIterationCountCheck->setName("iter.check");
7729 
7730   // Generate the code to check any assumptions that we've made for SCEV
7731   // expressions.
7732   EPI.SCEVSafetyCheck = emitSCEVChecks(LoopScalarPreHeader);
7733 
7734   // Generate the code that checks at runtime if arrays overlap. We put the
7735   // checks into a separate block to make the more common case of few elements
7736   // faster.
7737   EPI.MemSafetyCheck = emitMemRuntimeChecks(LoopScalarPreHeader);
7738 
7739   // Generate the iteration count check for the main loop, *after* the check
7740   // for the epilogue loop, so that the path-length is shorter for the case
7741   // that goes directly through the vector epilogue. The longer-path length for
7742   // the main loop is compensated for, by the gain from vectorizing the larger
7743   // trip count. Note: the branch will get updated later on when we vectorize
7744   // the epilogue.
7745   EPI.MainLoopIterationCountCheck =
7746       emitIterationCountCheck(LoopScalarPreHeader, false);
7747 
7748   // Generate the induction variable.
7749   EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
7750 
7751   // Skip induction resume value creation here because they will be created in
7752   // the second pass for the scalar loop. The induction resume values for the
7753   // inductions in the epilogue loop are created before executing the plan for
7754   // the epilogue loop.
7755 
7756   return {completeLoopSkeleton(), nullptr};
7757 }
7758 
7759 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
7760   LLVM_DEBUG({
7761     dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7762            << "Main Loop VF:" << EPI.MainLoopVF
7763            << ", Main Loop UF:" << EPI.MainLoopUF
7764            << ", Epilogue Loop VF:" << EPI.EpilogueVF
7765            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7766   });
7767 }
7768 
7769 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
7770   DEBUG_WITH_TYPE(VerboseDebug, {
7771     dbgs() << "intermediate fn:\n"
7772            << *OrigLoop->getHeader()->getParent() << "\n";
7773   });
7774 }
7775 
7776 BasicBlock *
7777 EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
7778                                                     bool ForEpilogue) {
7779   assert(Bypass && "Expected valid bypass basic block.");
7780   ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF;
7781   unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
7782   Value *Count = getTripCount();
7783   // Reuse existing vector loop preheader for TC checks.
7784   // Note that new preheader block is generated for vector loop.
7785   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
7786   IRBuilder<> Builder(TCCheckBlock->getTerminator());
7787 
7788   // Generate code to check if the loop's trip count is less than VF * UF of the
7789   // main vector loop.
7790   auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF.isVector()
7791                                                     : VF.isVector())
7792                ? ICmpInst::ICMP_ULE
7793                : ICmpInst::ICMP_ULT;
7794 
7795   Value *CheckMinIters = Builder.CreateICmp(
7796       P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor),
7797       "min.iters.check");
7798 
7799   if (!ForEpilogue)
7800     TCCheckBlock->setName("vector.main.loop.iter.check");
7801 
7802   // Create new preheader for vector loop.
7803   LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
7804                                    DT, LI, nullptr, "vector.ph");
7805 
7806   if (ForEpilogue) {
7807     assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
7808                                  DT->getNode(Bypass)->getIDom()) &&
7809            "TC check is expected to dominate Bypass");
7810 
7811     // Update dominator for Bypass & LoopExit.
7812     DT->changeImmediateDominator(Bypass, TCCheckBlock);
7813     if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector()))
7814       // For loops with multiple exits, there's no edge from the middle block
7815       // to exit blocks (as the epilogue must run) and thus no need to update
7816       // the immediate dominator of the exit blocks.
7817       DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
7818 
7819     LoopBypassBlocks.push_back(TCCheckBlock);
7820 
7821     // Save the trip count so we don't have to regenerate it in the
7822     // vec.epilog.iter.check. This is safe to do because the trip count
7823     // generated here dominates the vector epilog iter check.
7824     EPI.TripCount = Count;
7825   }
7826 
7827   BranchInst &BI =
7828       *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
7829   if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
7830     setBranchWeights(BI, MinItersBypassWeights);
7831   ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
7832 
7833   return TCCheckBlock;
7834 }
7835 
7836 //===--------------------------------------------------------------------===//
7837 // EpilogueVectorizerEpilogueLoop
7838 //===--------------------------------------------------------------------===//
7839 
7840 /// This function is partially responsible for generating the control flow
7841 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7842 std::pair<BasicBlock *, Value *>
7843 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
7844     const SCEV2ValueTy &ExpandedSCEVs) {
7845   createVectorLoopSkeleton("vec.epilog.");
7846 
7847   // Now, compare the remaining count and if there aren't enough iterations to
7848   // execute the vectorized epilogue skip to the scalar part.
7849   BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader;
7850   VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check");
7851   LoopVectorPreHeader =
7852       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
7853                  LI, nullptr, "vec.epilog.ph");
7854   emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader,
7855                                           VecEpilogueIterationCountCheck);
7856 
7857   // Adjust the control flow taking the state info from the main loop
7858   // vectorization into account.
7859   assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
7860          "expected this to be saved from the previous pass.");
7861   EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
7862       VecEpilogueIterationCountCheck, LoopVectorPreHeader);
7863 
7864   DT->changeImmediateDominator(LoopVectorPreHeader,
7865                                EPI.MainLoopIterationCountCheck);
7866 
7867   EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
7868       VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7869 
7870   if (EPI.SCEVSafetyCheck)
7871     EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
7872         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7873   if (EPI.MemSafetyCheck)
7874     EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
7875         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7876 
7877   DT->changeImmediateDominator(
7878       VecEpilogueIterationCountCheck,
7879       VecEpilogueIterationCountCheck->getSinglePredecessor());
7880 
7881   DT->changeImmediateDominator(LoopScalarPreHeader,
7882                                EPI.EpilogueIterationCountCheck);
7883   if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector()))
7884     // If there is an epilogue which must run, there's no edge from the
7885     // middle block to exit blocks  and thus no need to update the immediate
7886     // dominator of the exit blocks.
7887     DT->changeImmediateDominator(LoopExitBlock,
7888                                  EPI.EpilogueIterationCountCheck);
7889 
7890   // Keep track of bypass blocks, as they feed start values to the induction and
7891   // reduction phis in the scalar loop preheader.
7892   if (EPI.SCEVSafetyCheck)
7893     LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
7894   if (EPI.MemSafetyCheck)
7895     LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
7896   LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
7897 
7898   // The vec.epilog.iter.check block may contain Phi nodes from inductions or
7899   // reductions which merge control-flow from the latch block and the middle
7900   // block. Update the incoming values here and move the Phi into the preheader.
7901   SmallVector<PHINode *, 4> PhisInBlock;
7902   for (PHINode &Phi : VecEpilogueIterationCountCheck->phis())
7903     PhisInBlock.push_back(&Phi);
7904 
7905   for (PHINode *Phi : PhisInBlock) {
7906     Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI());
7907     Phi->replaceIncomingBlockWith(
7908         VecEpilogueIterationCountCheck->getSinglePredecessor(),
7909         VecEpilogueIterationCountCheck);
7910 
7911     // If the phi doesn't have an incoming value from the
7912     // EpilogueIterationCountCheck, we are done. Otherwise remove the incoming
7913     // value and also those from other check blocks. This is needed for
7914     // reduction phis only.
7915     if (none_of(Phi->blocks(), [&](BasicBlock *IncB) {
7916           return EPI.EpilogueIterationCountCheck == IncB;
7917         }))
7918       continue;
7919     Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
7920     if (EPI.SCEVSafetyCheck)
7921       Phi->removeIncomingValue(EPI.SCEVSafetyCheck);
7922     if (EPI.MemSafetyCheck)
7923       Phi->removeIncomingValue(EPI.MemSafetyCheck);
7924   }
7925 
7926   // Generate a resume induction for the vector epilogue and put it in the
7927   // vector epilogue preheader
7928   Type *IdxTy = Legal->getWidestInductionType();
7929   PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val");
7930   EPResumeVal->insertBefore(LoopVectorPreHeader->getFirstNonPHIIt());
7931   EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
7932   EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
7933                            EPI.MainLoopIterationCountCheck);
7934 
7935   // Generate induction resume values. These variables save the new starting
7936   // indexes for the scalar loop. They are used to test if there are any tail
7937   // iterations left once the vector loop has completed.
7938   // Note that when the vectorized epilogue is skipped due to iteration count
7939   // check, then the resume value for the induction variable comes from
7940   // the trip count of the main vector loop, hence passing the AdditionalBypass
7941   // argument.
7942   createInductionResumeValues(ExpandedSCEVs,
7943                               {VecEpilogueIterationCountCheck,
7944                                EPI.VectorTripCount} /* AdditionalBypass */);
7945 
7946   return {completeLoopSkeleton(), EPResumeVal};
7947 }
7948 
7949 BasicBlock *
7950 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
7951     BasicBlock *Bypass, BasicBlock *Insert) {
7952 
7953   assert(EPI.TripCount &&
7954          "Expected trip count to have been safed in the first pass.");
7955   assert(
7956       (!isa<Instruction>(EPI.TripCount) ||
7957        DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
7958       "saved trip count does not dominate insertion point.");
7959   Value *TC = EPI.TripCount;
7960   IRBuilder<> Builder(Insert->getTerminator());
7961   Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
7962 
7963   // Generate code to check if the loop's trip count is less than VF * UF of the
7964   // vector epilogue loop.
7965   auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector())
7966                ? ICmpInst::ICMP_ULE
7967                : ICmpInst::ICMP_ULT;
7968 
7969   Value *CheckMinIters =
7970       Builder.CreateICmp(P, Count,
7971                          createStepForVF(Builder, Count->getType(),
7972                                          EPI.EpilogueVF, EPI.EpilogueUF),
7973                          "min.epilog.iters.check");
7974 
7975   BranchInst &BI =
7976       *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
7977   if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
7978     unsigned MainLoopStep = UF * VF.getKnownMinValue();
7979     unsigned EpilogueLoopStep =
7980         EPI.EpilogueUF * EPI.EpilogueVF.getKnownMinValue();
7981     // We assume the remaining `Count` is equally distributed in
7982     // [0, MainLoopStep)
7983     // So the probability for `Count < EpilogueLoopStep` should be
7984     // min(MainLoopStep, EpilogueLoopStep) / MainLoopStep
7985     unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep);
7986     const uint32_t Weights[] = {EstimatedSkipCount,
7987                                 MainLoopStep - EstimatedSkipCount};
7988     setBranchWeights(BI, Weights);
7989   }
7990   ReplaceInstWithInst(Insert->getTerminator(), &BI);
7991 
7992   LoopBypassBlocks.push_back(Insert);
7993   return Insert;
7994 }
7995 
7996 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
7997   LLVM_DEBUG({
7998     dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
7999            << "Epilogue Loop VF:" << EPI.EpilogueVF
8000            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
8001   });
8002 }
8003 
8004 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
8005   DEBUG_WITH_TYPE(VerboseDebug, {
8006     dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
8007   });
8008 }
8009 
8010 bool LoopVectorizationPlanner::getDecisionAndClampRange(
8011     const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
8012   assert(!Range.isEmpty() && "Trying to test an empty VF range.");
8013   bool PredicateAtRangeStart = Predicate(Range.Start);
8014 
8015   for (ElementCount TmpVF : VFRange(Range.Start * 2, Range.End))
8016     if (Predicate(TmpVF) != PredicateAtRangeStart) {
8017       Range.End = TmpVF;
8018       break;
8019     }
8020 
8021   return PredicateAtRangeStart;
8022 }
8023 
8024 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
8025 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
8026 /// of VF's starting at a given VF and extending it as much as possible. Each
8027 /// vectorization decision can potentially shorten this sub-range during
8028 /// buildVPlan().
8029 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
8030                                            ElementCount MaxVF) {
8031   auto MaxVFTimes2 = MaxVF * 2;
8032   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
8033     VFRange SubRange = {VF, MaxVFTimes2};
8034     VPlans.push_back(buildVPlan(SubRange));
8035     VF = SubRange.End;
8036   }
8037 }
8038 
8039 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
8040                                          VPlan &Plan) {
8041   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
8042 
8043   // Look for cached value.
8044   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
8045   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
8046   if (ECEntryIt != EdgeMaskCache.end())
8047     return ECEntryIt->second;
8048 
8049   VPValue *SrcMask = createBlockInMask(Src, Plan);
8050 
8051   // The terminator has to be a branch inst!
8052   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
8053   assert(BI && "Unexpected terminator found");
8054 
8055   if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
8056     return EdgeMaskCache[Edge] = SrcMask;
8057 
8058   // If source is an exiting block, we know the exit edge is dynamically dead
8059   // in the vector loop, and thus we don't need to restrict the mask.  Avoid
8060   // adding uses of an otherwise potentially dead instruction.
8061   if (OrigLoop->isLoopExiting(Src))
8062     return EdgeMaskCache[Edge] = SrcMask;
8063 
8064   VPValue *EdgeMask = Plan.getVPValueOrAddLiveIn(BI->getCondition());
8065   assert(EdgeMask && "No Edge Mask found for condition");
8066 
8067   if (BI->getSuccessor(0) != Dst)
8068     EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc());
8069 
8070   if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
8071     // The condition is 'SrcMask && EdgeMask', which is equivalent to
8072     // 'select i1 SrcMask, i1 EdgeMask, i1 false'.
8073     // The select version does not introduce new UB if SrcMask is false and
8074     // EdgeMask is poison. Using 'and' here introduces undefined behavior.
8075     VPValue *False = Plan.getVPValueOrAddLiveIn(
8076         ConstantInt::getFalse(BI->getCondition()->getType()));
8077     EdgeMask =
8078         Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc());
8079   }
8080 
8081   return EdgeMaskCache[Edge] = EdgeMask;
8082 }
8083 
8084 void VPRecipeBuilder::createHeaderMask(VPlan &Plan) {
8085   BasicBlock *Header = OrigLoop->getHeader();
8086 
8087   // When not folding the tail, use nullptr to model all-true mask.
8088   if (!CM.foldTailByMasking()) {
8089     BlockMaskCache[Header] = nullptr;
8090     return;
8091   }
8092 
8093   // Introduce the early-exit compare IV <= BTC to form header block mask.
8094   // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
8095   // constructing the desired canonical IV in the header block as its first
8096   // non-phi instructions.
8097 
8098   VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
8099   auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
8100   auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV());
8101   HeaderVPBB->insert(IV, NewInsertionPoint);
8102 
8103   VPBuilder::InsertPointGuard Guard(Builder);
8104   Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
8105   VPValue *BlockMask = nullptr;
8106   VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
8107   BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC);
8108   BlockMaskCache[Header] = BlockMask;
8109 }
8110 
8111 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) {
8112   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
8113 
8114   // Look for cached value.
8115   BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
8116   if (BCEntryIt != BlockMaskCache.end())
8117     return BCEntryIt->second;
8118 
8119   assert(OrigLoop->getHeader() != BB &&
8120          "Loop header must have cached block mask");
8121 
8122   // All-one mask is modelled as no-mask following the convention for masked
8123   // load/store/gather/scatter. Initialize BlockMask to no-mask.
8124   VPValue *BlockMask = nullptr;
8125   // This is the block mask. We OR all incoming edges.
8126   for (auto *Predecessor : predecessors(BB)) {
8127     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
8128     if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
8129       return BlockMaskCache[BB] = EdgeMask;
8130 
8131     if (!BlockMask) { // BlockMask has its initialized nullptr value.
8132       BlockMask = EdgeMask;
8133       continue;
8134     }
8135 
8136     BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
8137   }
8138 
8139   return BlockMaskCache[BB] = BlockMask;
8140 }
8141 
8142 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
8143                                                 ArrayRef<VPValue *> Operands,
8144                                                 VFRange &Range,
8145                                                 VPlanPtr &Plan) {
8146   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
8147          "Must be called with either a load or store");
8148 
8149   auto willWiden = [&](ElementCount VF) -> bool {
8150     LoopVectorizationCostModel::InstWidening Decision =
8151         CM.getWideningDecision(I, VF);
8152     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
8153            "CM decision should be taken at this point.");
8154     if (Decision == LoopVectorizationCostModel::CM_Interleave)
8155       return true;
8156     if (CM.isScalarAfterVectorization(I, VF) ||
8157         CM.isProfitableToScalarize(I, VF))
8158       return false;
8159     return Decision != LoopVectorizationCostModel::CM_Scalarize;
8160   };
8161 
8162   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8163     return nullptr;
8164 
8165   VPValue *Mask = nullptr;
8166   if (Legal->isMaskRequired(I))
8167     Mask = createBlockInMask(I->getParent(), *Plan);
8168 
8169   // Determine if the pointer operand of the access is either consecutive or
8170   // reverse consecutive.
8171   LoopVectorizationCostModel::InstWidening Decision =
8172       CM.getWideningDecision(I, Range.Start);
8173   bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
8174   bool Consecutive =
8175       Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
8176 
8177   if (LoadInst *Load = dyn_cast<LoadInst>(I))
8178     return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask,
8179                                               Consecutive, Reverse);
8180 
8181   StoreInst *Store = cast<StoreInst>(I);
8182   return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0],
8183                                             Mask, Consecutive, Reverse);
8184 }
8185 
8186 /// Creates a VPWidenIntOrFpInductionRecpipe for \p Phi. If needed, it will also
8187 /// insert a recipe to expand the step for the induction recipe.
8188 static VPWidenIntOrFpInductionRecipe *
8189 createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc,
8190                             VPValue *Start, const InductionDescriptor &IndDesc,
8191                             VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop,
8192                             VFRange &Range) {
8193   assert(IndDesc.getStartValue() ==
8194          Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
8195   assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) &&
8196          "step must be loop invariant");
8197 
8198   VPValue *Step =
8199       vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE);
8200   if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) {
8201     return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI);
8202   }
8203   assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
8204   return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc);
8205 }
8206 
8207 VPRecipeBase *VPRecipeBuilder::tryToOptimizeInductionPHI(
8208     PHINode *Phi, ArrayRef<VPValue *> Operands, VPlan &Plan, VFRange &Range) {
8209 
8210   // Check if this is an integer or fp induction. If so, build the recipe that
8211   // produces its scalar and vector values.
8212   if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
8213     return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, Plan,
8214                                        *PSE.getSE(), *OrigLoop, Range);
8215 
8216   // Check if this is pointer induction. If so, build the recipe for it.
8217   if (auto *II = Legal->getPointerInductionDescriptor(Phi)) {
8218     VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep(),
8219                                                            *PSE.getSE());
8220     return new VPWidenPointerInductionRecipe(
8221         Phi, Operands[0], Step, *II,
8222         LoopVectorizationPlanner::getDecisionAndClampRange(
8223             [&](ElementCount VF) {
8224               return CM.isScalarAfterVectorization(Phi, VF);
8225             },
8226             Range));
8227   }
8228   return nullptr;
8229 }
8230 
8231 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
8232     TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, VPlan &Plan) {
8233   // Optimize the special case where the source is a constant integer
8234   // induction variable. Notice that we can only optimize the 'trunc' case
8235   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
8236   // (c) other casts depend on pointer size.
8237 
8238   // Determine whether \p K is a truncation based on an induction variable that
8239   // can be optimized.
8240   auto isOptimizableIVTruncate =
8241       [&](Instruction *K) -> std::function<bool(ElementCount)> {
8242     return [=](ElementCount VF) -> bool {
8243       return CM.isOptimizableIVTruncate(K, VF);
8244     };
8245   };
8246 
8247   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8248           isOptimizableIVTruncate(I), Range)) {
8249 
8250     auto *Phi = cast<PHINode>(I->getOperand(0));
8251     const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi);
8252     VPValue *Start = Plan.getVPValueOrAddLiveIn(II.getStartValue());
8253     return createWidenInductionRecipes(Phi, I, Start, II, Plan, *PSE.getSE(),
8254                                        *OrigLoop, Range);
8255   }
8256   return nullptr;
8257 }
8258 
8259 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi,
8260                                                 ArrayRef<VPValue *> Operands,
8261                                                 VPlanPtr &Plan) {
8262   // If all incoming values are equal, the incoming VPValue can be used directly
8263   // instead of creating a new VPBlendRecipe.
8264   if (llvm::all_equal(Operands))
8265     return Operands[0];
8266 
8267   unsigned NumIncoming = Phi->getNumIncomingValues();
8268   // For in-loop reductions, we do not need to create an additional select.
8269   VPValue *InLoopVal = nullptr;
8270   for (unsigned In = 0; In < NumIncoming; In++) {
8271     PHINode *PhiOp =
8272         dyn_cast_or_null<PHINode>(Operands[In]->getUnderlyingValue());
8273     if (PhiOp && CM.isInLoopReduction(PhiOp)) {
8274       assert(!InLoopVal && "Found more than one in-loop reduction!");
8275       InLoopVal = Operands[In];
8276     }
8277   }
8278 
8279   assert((!InLoopVal || NumIncoming == 2) &&
8280          "Found an in-loop reduction for PHI with unexpected number of "
8281          "incoming values");
8282   if (InLoopVal)
8283     return Operands[Operands[0] == InLoopVal ? 1 : 0];
8284 
8285   // We know that all PHIs in non-header blocks are converted into selects, so
8286   // we don't have to worry about the insertion order and we can just use the
8287   // builder. At this point we generate the predication tree. There may be
8288   // duplications since this is a simple recursive scan, but future
8289   // optimizations will clean it up.
8290   SmallVector<VPValue *, 2> OperandsWithMask;
8291 
8292   for (unsigned In = 0; In < NumIncoming; In++) {
8293     VPValue *EdgeMask =
8294         createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), *Plan);
8295     assert((EdgeMask || NumIncoming == 1) &&
8296            "Multiple predecessors with one having a full mask");
8297     OperandsWithMask.push_back(Operands[In]);
8298     if (EdgeMask)
8299       OperandsWithMask.push_back(EdgeMask);
8300   }
8301   return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask));
8302 }
8303 
8304 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
8305                                                    ArrayRef<VPValue *> Operands,
8306                                                    VFRange &Range,
8307                                                    VPlanPtr &Plan) {
8308   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8309       [this, CI](ElementCount VF) {
8310         return CM.isScalarWithPredication(CI, VF);
8311       },
8312       Range);
8313 
8314   if (IsPredicated)
8315     return nullptr;
8316 
8317   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8318   if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8319              ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8320              ID == Intrinsic::pseudoprobe ||
8321              ID == Intrinsic::experimental_noalias_scope_decl))
8322     return nullptr;
8323 
8324   SmallVector<VPValue *, 4> Ops(Operands.take_front(CI->arg_size()));
8325 
8326   // Is it beneficial to perform intrinsic call compared to lib call?
8327   bool ShouldUseVectorIntrinsic =
8328       ID && LoopVectorizationPlanner::getDecisionAndClampRange(
8329                 [&](ElementCount VF) -> bool {
8330                   return CM.getCallWideningDecision(CI, VF).Kind ==
8331                          LoopVectorizationCostModel::CM_IntrinsicCall;
8332                 },
8333                 Range);
8334   if (ShouldUseVectorIntrinsic)
8335     return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()), ID);
8336 
8337   Function *Variant = nullptr;
8338   std::optional<unsigned> MaskPos;
8339   // Is better to call a vectorized version of the function than to to scalarize
8340   // the call?
8341   auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange(
8342       [&](ElementCount VF) -> bool {
8343         // The following case may be scalarized depending on the VF.
8344         // The flag shows whether we can use a usual Call for vectorized
8345         // version of the instruction.
8346 
8347         // If we've found a variant at a previous VF, then stop looking. A
8348         // vectorized variant of a function expects input in a certain shape
8349         // -- basically the number of input registers, the number of lanes
8350         // per register, and whether there's a mask required.
8351         // We store a pointer to the variant in the VPWidenCallRecipe, so
8352         // once we have an appropriate variant it's only valid for that VF.
8353         // This will force a different vplan to be generated for each VF that
8354         // finds a valid variant.
8355         if (Variant)
8356           return false;
8357         LoopVectorizationCostModel::CallWideningDecision Decision =
8358             CM.getCallWideningDecision(CI, VF);
8359         if (Decision.Kind == LoopVectorizationCostModel::CM_VectorCall) {
8360           Variant = Decision.Variant;
8361           MaskPos = Decision.MaskPos;
8362           return true;
8363         }
8364 
8365         return false;
8366       },
8367       Range);
8368   if (ShouldUseVectorCall) {
8369     if (MaskPos.has_value()) {
8370       // We have 2 cases that would require a mask:
8371       //   1) The block needs to be predicated, either due to a conditional
8372       //      in the scalar loop or use of an active lane mask with
8373       //      tail-folding, and we use the appropriate mask for the block.
8374       //   2) No mask is required for the block, but the only available
8375       //      vector variant at this VF requires a mask, so we synthesize an
8376       //      all-true mask.
8377       VPValue *Mask = nullptr;
8378       if (Legal->isMaskRequired(CI))
8379         Mask = createBlockInMask(CI->getParent(), *Plan);
8380       else
8381         Mask = Plan->getVPValueOrAddLiveIn(ConstantInt::getTrue(
8382             IntegerType::getInt1Ty(Variant->getFunctionType()->getContext())));
8383 
8384       Ops.insert(Ops.begin() + *MaskPos, Mask);
8385     }
8386 
8387     return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()),
8388                                  Intrinsic::not_intrinsic, Variant);
8389   }
8390 
8391   return nullptr;
8392 }
8393 
8394 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
8395   assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8396          !isa<StoreInst>(I) && "Instruction should have been handled earlier");
8397   // Instruction should be widened, unless it is scalar after vectorization,
8398   // scalarization is profitable or it is predicated.
8399   auto WillScalarize = [this, I](ElementCount VF) -> bool {
8400     return CM.isScalarAfterVectorization(I, VF) ||
8401            CM.isProfitableToScalarize(I, VF) ||
8402            CM.isScalarWithPredication(I, VF);
8403   };
8404   return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
8405                                                              Range);
8406 }
8407 
8408 VPRecipeBase *VPRecipeBuilder::tryToWiden(Instruction *I,
8409                                           ArrayRef<VPValue *> Operands,
8410                                           VPBasicBlock *VPBB, VPlanPtr &Plan) {
8411   switch (I->getOpcode()) {
8412   default:
8413     return nullptr;
8414   case Instruction::SDiv:
8415   case Instruction::UDiv:
8416   case Instruction::SRem:
8417   case Instruction::URem: {
8418     // If not provably safe, use a select to form a safe divisor before widening the
8419     // div/rem operation itself.  Otherwise fall through to general handling below.
8420     if (CM.isPredicatedInst(I)) {
8421       SmallVector<VPValue *> Ops(Operands.begin(), Operands.end());
8422       VPValue *Mask = createBlockInMask(I->getParent(), *Plan);
8423       VPValue *One = Plan->getVPValueOrAddLiveIn(
8424           ConstantInt::get(I->getType(), 1u, false));
8425       auto *SafeRHS =
8426          new VPInstruction(Instruction::Select, {Mask, Ops[1], One},
8427                            I->getDebugLoc());
8428       VPBB->appendRecipe(SafeRHS);
8429       Ops[1] = SafeRHS;
8430       return new VPWidenRecipe(*I, make_range(Ops.begin(), Ops.end()));
8431     }
8432     [[fallthrough]];
8433   }
8434   case Instruction::Add:
8435   case Instruction::And:
8436   case Instruction::AShr:
8437   case Instruction::FAdd:
8438   case Instruction::FCmp:
8439   case Instruction::FDiv:
8440   case Instruction::FMul:
8441   case Instruction::FNeg:
8442   case Instruction::FRem:
8443   case Instruction::FSub:
8444   case Instruction::ICmp:
8445   case Instruction::LShr:
8446   case Instruction::Mul:
8447   case Instruction::Or:
8448   case Instruction::Select:
8449   case Instruction::Shl:
8450   case Instruction::Sub:
8451   case Instruction::Xor:
8452   case Instruction::Freeze:
8453     return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end()));
8454   };
8455 }
8456 
8457 void VPRecipeBuilder::fixHeaderPhis() {
8458   BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
8459   for (VPHeaderPHIRecipe *R : PhisToFix) {
8460     auto *PN = cast<PHINode>(R->getUnderlyingValue());
8461     VPRecipeBase *IncR =
8462         getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
8463     R->addOperand(IncR->getVPSingleValue());
8464   }
8465 }
8466 
8467 VPRecipeOrVPValueTy VPRecipeBuilder::handleReplication(Instruction *I,
8468                                                        VFRange &Range,
8469                                                        VPlan &Plan) {
8470   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
8471       [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8472       Range);
8473 
8474   bool IsPredicated = CM.isPredicatedInst(I);
8475 
8476   // Even if the instruction is not marked as uniform, there are certain
8477   // intrinsic calls that can be effectively treated as such, so we check for
8478   // them here. Conservatively, we only do this for scalable vectors, since
8479   // for fixed-width VFs we can always fall back on full scalarization.
8480   if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
8481     switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
8482     case Intrinsic::assume:
8483     case Intrinsic::lifetime_start:
8484     case Intrinsic::lifetime_end:
8485       // For scalable vectors if one of the operands is variant then we still
8486       // want to mark as uniform, which will generate one instruction for just
8487       // the first lane of the vector. We can't scalarize the call in the same
8488       // way as for fixed-width vectors because we don't know how many lanes
8489       // there are.
8490       //
8491       // The reasons for doing it this way for scalable vectors are:
8492       //   1. For the assume intrinsic generating the instruction for the first
8493       //      lane is still be better than not generating any at all. For
8494       //      example, the input may be a splat across all lanes.
8495       //   2. For the lifetime start/end intrinsics the pointer operand only
8496       //      does anything useful when the input comes from a stack object,
8497       //      which suggests it should always be uniform. For non-stack objects
8498       //      the effect is to poison the object, which still allows us to
8499       //      remove the call.
8500       IsUniform = true;
8501       break;
8502     default:
8503       break;
8504     }
8505   }
8506   VPValue *BlockInMask = nullptr;
8507   if (!IsPredicated) {
8508     // Finalize the recipe for Instr, first if it is not predicated.
8509     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8510   } else {
8511     LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8512     // Instructions marked for predication are replicated and a mask operand is
8513     // added initially. Masked replicate recipes will later be placed under an
8514     // if-then construct to prevent side-effects. Generate recipes to compute
8515     // the block mask for this region.
8516     BlockInMask = createBlockInMask(I->getParent(), Plan);
8517   }
8518 
8519   auto *Recipe = new VPReplicateRecipe(I, Plan.mapToVPValues(I->operands()),
8520                                        IsUniform, BlockInMask);
8521   return toVPRecipeResult(Recipe);
8522 }
8523 
8524 VPRecipeOrVPValueTy
8525 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8526                                         ArrayRef<VPValue *> Operands,
8527                                         VFRange &Range, VPBasicBlock *VPBB,
8528                                         VPlanPtr &Plan) {
8529   // First, check for specific widening recipes that deal with inductions, Phi
8530   // nodes, calls and memory operations.
8531   VPRecipeBase *Recipe;
8532   if (auto Phi = dyn_cast<PHINode>(Instr)) {
8533     if (Phi->getParent() != OrigLoop->getHeader())
8534       return tryToBlend(Phi, Operands, Plan);
8535 
8536     // Always record recipes for header phis. Later first-order recurrence phis
8537     // can have earlier phis as incoming values.
8538     recordRecipeOf(Phi);
8539 
8540     if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, *Plan, Range)))
8541       return toVPRecipeResult(Recipe);
8542 
8543     VPHeaderPHIRecipe *PhiRecipe = nullptr;
8544     assert((Legal->isReductionVariable(Phi) ||
8545             Legal->isFixedOrderRecurrence(Phi)) &&
8546            "can only widen reductions and fixed-order recurrences here");
8547     VPValue *StartV = Operands[0];
8548     if (Legal->isReductionVariable(Phi)) {
8549       const RecurrenceDescriptor &RdxDesc =
8550           Legal->getReductionVars().find(Phi)->second;
8551       assert(RdxDesc.getRecurrenceStartValue() ==
8552              Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8553       PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
8554                                            CM.isInLoopReduction(Phi),
8555                                            CM.useOrderedReductions(RdxDesc));
8556     } else {
8557       // TODO: Currently fixed-order recurrences are modeled as chains of
8558       // first-order recurrences. If there are no users of the intermediate
8559       // recurrences in the chain, the fixed order recurrence should be modeled
8560       // directly, enabling more efficient codegen.
8561       PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
8562     }
8563 
8564     // Record the incoming value from the backedge, so we can add the incoming
8565     // value from the backedge after all recipes have been created.
8566     auto *Inc = cast<Instruction>(
8567         Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
8568     auto RecipeIter = Ingredient2Recipe.find(Inc);
8569     if (RecipeIter == Ingredient2Recipe.end())
8570       recordRecipeOf(Inc);
8571 
8572     PhisToFix.push_back(PhiRecipe);
8573     return toVPRecipeResult(PhiRecipe);
8574   }
8575 
8576   if (isa<TruncInst>(Instr) &&
8577       (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands,
8578                                                Range, *Plan)))
8579     return toVPRecipeResult(Recipe);
8580 
8581   // All widen recipes below deal only with VF > 1.
8582   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8583           [&](ElementCount VF) { return VF.isScalar(); }, Range))
8584     return nullptr;
8585 
8586   if (auto *CI = dyn_cast<CallInst>(Instr))
8587     return toVPRecipeResult(tryToWidenCall(CI, Operands, Range, Plan));
8588 
8589   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8590     return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));
8591 
8592   if (!shouldWiden(Instr, Range))
8593     return nullptr;
8594 
8595   if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
8596     return toVPRecipeResult(new VPWidenGEPRecipe(
8597         GEP, make_range(Operands.begin(), Operands.end())));
8598 
8599   if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8600     return toVPRecipeResult(new VPWidenSelectRecipe(
8601         *SI, make_range(Operands.begin(), Operands.end())));
8602   }
8603 
8604   if (auto *CI = dyn_cast<CastInst>(Instr)) {
8605     return toVPRecipeResult(new VPWidenCastRecipe(CI->getOpcode(), Operands[0],
8606                                                   CI->getType(), *CI));
8607   }
8608 
8609   return toVPRecipeResult(tryToWiden(Instr, Operands, VPBB, Plan));
8610 }
8611 
8612 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8613                                                         ElementCount MaxVF) {
8614   assert(OrigLoop->isInnermost() && "Inner loop expected.");
8615 
8616   auto MaxVFTimes2 = MaxVF * 2;
8617   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
8618     VFRange SubRange = {VF, MaxVFTimes2};
8619     if (auto Plan = tryToBuildVPlanWithVPRecipes(SubRange)) {
8620       // Now optimize the initial VPlan.
8621       if (!Plan->hasVF(ElementCount::getFixed(1)))
8622         VPlanTransforms::truncateToMinimalBitwidths(
8623             *Plan, CM.getMinimalBitwidths(), PSE.getSE()->getContext());
8624       VPlanTransforms::optimize(*Plan, *PSE.getSE());
8625       assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid");
8626       VPlans.push_back(std::move(Plan));
8627     }
8628     VF = SubRange.End;
8629   }
8630 }
8631 
8632 // Add the necessary canonical IV and branch recipes required to control the
8633 // loop.
8634 static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
8635                                   DebugLoc DL) {
8636   Value *StartIdx = ConstantInt::get(IdxTy, 0);
8637   auto *StartV = Plan.getVPValueOrAddLiveIn(StartIdx);
8638 
8639   // Add a VPCanonicalIVPHIRecipe starting at 0 to the header.
8640   auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
8641   VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
8642   VPBasicBlock *Header = TopRegion->getEntryBasicBlock();
8643   Header->insert(CanonicalIVPHI, Header->begin());
8644 
8645   // Add a CanonicalIVIncrement{NUW} VPInstruction to increment the scalar
8646   // IV by VF * UF.
8647   auto *CanonicalIVIncrement =
8648       new VPInstruction(Instruction::Add, {CanonicalIVPHI, &Plan.getVFxUF()},
8649                         {HasNUW, false}, DL, "index.next");
8650   CanonicalIVPHI->addOperand(CanonicalIVIncrement);
8651 
8652   VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
8653   EB->appendRecipe(CanonicalIVIncrement);
8654 
8655   // Add the BranchOnCount VPInstruction to the latch.
8656   VPInstruction *BranchBack =
8657       new VPInstruction(VPInstruction::BranchOnCount,
8658                         {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
8659   EB->appendRecipe(BranchBack);
8660 }
8661 
8662 // Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the
8663 // original exit block.
8664 static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB, Loop *OrigLoop,
8665                                 VPlan &Plan) {
8666   BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock();
8667   BasicBlock *ExitingBB = OrigLoop->getExitingBlock();
8668   // Only handle single-exit loops with unique exit blocks for now.
8669   if (!ExitBB || !ExitBB->getSinglePredecessor() || !ExitingBB)
8670     return;
8671 
8672   // Introduce VPUsers modeling the exit values.
8673   for (PHINode &ExitPhi : ExitBB->phis()) {
8674     Value *IncomingValue =
8675         ExitPhi.getIncomingValueForBlock(ExitingBB);
8676     VPValue *V = Plan.getVPValueOrAddLiveIn(IncomingValue);
8677     Plan.addLiveOut(&ExitPhi, V);
8678   }
8679 }
8680 
8681 VPlanPtr
8682 LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
8683 
8684   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
8685 
8686   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
8687 
8688   // ---------------------------------------------------------------------------
8689   // Pre-construction: record ingredients whose recipes we'll need to further
8690   // process after constructing the initial VPlan.
8691   // ---------------------------------------------------------------------------
8692 
8693   // For each interleave group which is relevant for this (possibly trimmed)
8694   // Range, add it to the set of groups to be later applied to the VPlan and add
8695   // placeholders for its members' Recipes which we'll be replacing with a
8696   // single VPInterleaveRecipe.
8697   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
8698     auto applyIG = [IG, this](ElementCount VF) -> bool {
8699       bool Result = (VF.isVector() && // Query is illegal for VF == 1
8700                      CM.getWideningDecision(IG->getInsertPos(), VF) ==
8701                          LoopVectorizationCostModel::CM_Interleave);
8702       // For scalable vectors, the only interleave factor currently supported
8703       // is 2 since we require the (de)interleave2 intrinsics instead of
8704       // shufflevectors.
8705       assert((!Result || !VF.isScalable() || IG->getFactor() == 2) &&
8706              "Unsupported interleave factor for scalable vectors");
8707       return Result;
8708     };
8709     if (!getDecisionAndClampRange(applyIG, Range))
8710       continue;
8711     InterleaveGroups.insert(IG);
8712     for (unsigned i = 0; i < IG->getFactor(); i++)
8713       if (Instruction *Member = IG->getMember(i))
8714         RecipeBuilder.recordRecipeOf(Member);
8715   };
8716 
8717   // ---------------------------------------------------------------------------
8718   // Build initial VPlan: Scan the body of the loop in a topological order to
8719   // visit each basic block after having visited its predecessor basic blocks.
8720   // ---------------------------------------------------------------------------
8721 
8722   // Create initial VPlan skeleton, having a basic block for the pre-header
8723   // which contains SCEV expansions that need to happen before the CFG is
8724   // modified; a basic block for the vector pre-header, followed by a region for
8725   // the vector loop, followed by the middle basic block. The skeleton vector
8726   // loop region contains a header and latch basic blocks.
8727   VPlanPtr Plan = VPlan::createInitialVPlan(
8728       createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop),
8729       *PSE.getSE());
8730   VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body");
8731   VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch");
8732   VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB);
8733   Plan->getVectorLoopRegion()->setEntry(HeaderVPBB);
8734   Plan->getVectorLoopRegion()->setExiting(LatchVPBB);
8735 
8736   // Don't use getDecisionAndClampRange here, because we don't know the UF
8737   // so this function is better to be conservative, rather than to split
8738   // it up into different VPlans.
8739   // TODO: Consider using getDecisionAndClampRange here to split up VPlans.
8740   bool IVUpdateMayOverflow = false;
8741   for (ElementCount VF : Range)
8742     IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF);
8743 
8744   DebugLoc DL = getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
8745   TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow);
8746   // When not folding the tail, we know that the induction increment will not
8747   // overflow.
8748   bool HasNUW = Style == TailFoldingStyle::None;
8749   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL);
8750 
8751   // Proactively create header mask. Masks for other blocks are created on
8752   // demand.
8753   RecipeBuilder.createHeaderMask(*Plan);
8754 
8755   // Scan the body of the loop in a topological order to visit each basic block
8756   // after having visited its predecessor basic blocks.
8757   LoopBlocksDFS DFS(OrigLoop);
8758   DFS.perform(LI);
8759 
8760   VPBasicBlock *VPBB = HeaderVPBB;
8761   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
8762     // Relevant instructions from basic block BB will be grouped into VPRecipe
8763     // ingredients and fill a new VPBasicBlock.
8764     if (VPBB != HeaderVPBB)
8765       VPBB->setName(BB->getName());
8766     Builder.setInsertPoint(VPBB);
8767 
8768     // Introduce each ingredient into VPlan.
8769     // TODO: Model and preserve debug intrinsics in VPlan.
8770     for (Instruction &I : drop_end(BB->instructionsWithoutDebug(false))) {
8771       Instruction *Instr = &I;
8772       SmallVector<VPValue *, 4> Operands;
8773       auto *Phi = dyn_cast<PHINode>(Instr);
8774       if (Phi && Phi->getParent() == OrigLoop->getHeader()) {
8775         Operands.push_back(Plan->getVPValueOrAddLiveIn(
8776             Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
8777       } else {
8778         auto OpRange = Plan->mapToVPValues(Instr->operands());
8779         Operands = {OpRange.begin(), OpRange.end()};
8780       }
8781 
8782       // Invariant stores inside loop will be deleted and a single store
8783       // with the final reduction value will be added to the exit block
8784       StoreInst *SI;
8785       if ((SI = dyn_cast<StoreInst>(&I)) &&
8786           Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
8787         continue;
8788 
8789       auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe(
8790           Instr, Operands, Range, VPBB, Plan);
8791       if (!RecipeOrValue)
8792         RecipeOrValue = RecipeBuilder.handleReplication(Instr, Range, *Plan);
8793       // If Instr can be simplified to an existing VPValue, use it.
8794       if (isa<VPValue *>(RecipeOrValue)) {
8795         auto *VPV = cast<VPValue *>(RecipeOrValue);
8796         Plan->addVPValue(Instr, VPV);
8797         // If the re-used value is a recipe, register the recipe for the
8798         // instruction, in case the recipe for Instr needs to be recorded.
8799         if (VPRecipeBase *R = VPV->getDefiningRecipe())
8800           RecipeBuilder.setRecipe(Instr, R);
8801         continue;
8802       }
8803       // Otherwise, add the new recipe.
8804       VPRecipeBase *Recipe = cast<VPRecipeBase *>(RecipeOrValue);
8805       for (auto *Def : Recipe->definedValues()) {
8806         auto *UV = Def->getUnderlyingValue();
8807         Plan->addVPValue(UV, Def);
8808       }
8809 
8810       RecipeBuilder.setRecipe(Instr, Recipe);
8811       if (isa<VPHeaderPHIRecipe>(Recipe)) {
8812         // VPHeaderPHIRecipes must be kept in the phi section of HeaderVPBB. In
8813         // the following cases, VPHeaderPHIRecipes may be created after non-phi
8814         // recipes and need to be moved to the phi section of HeaderVPBB:
8815         // * tail-folding (non-phi recipes computing the header mask are
8816         // introduced earlier than regular header phi recipes, and should appear
8817         // after them)
8818         // * Optimizing truncates to VPWidenIntOrFpInductionRecipe.
8819 
8820         assert((HeaderVPBB->getFirstNonPhi() == VPBB->end() ||
8821                 CM.foldTailByMasking() || isa<TruncInst>(Instr)) &&
8822                "unexpected recipe needs moving");
8823         Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
8824       } else
8825         VPBB->appendRecipe(Recipe);
8826     }
8827 
8828     VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB);
8829     VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
8830   }
8831 
8832   // After here, VPBB should not be used.
8833   VPBB = nullptr;
8834 
8835   if (CM.requiresScalarEpilogue(Range)) {
8836     // No edge from the middle block to the unique exit block has been inserted
8837     // and there is nothing to fix from vector loop; phis should have incoming
8838     // from scalar loop only.
8839   } else
8840     addUsersInExitBlock(HeaderVPBB, OrigLoop, *Plan);
8841 
8842   assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
8843          !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
8844          "entry block must be set to a VPRegionBlock having a non-empty entry "
8845          "VPBasicBlock");
8846   RecipeBuilder.fixHeaderPhis();
8847 
8848   // ---------------------------------------------------------------------------
8849   // Transform initial VPlan: Apply previously taken decisions, in order, to
8850   // bring the VPlan to its final state.
8851   // ---------------------------------------------------------------------------
8852 
8853   // Adjust the recipes for any inloop reductions.
8854   adjustRecipesForReductions(LatchVPBB, Plan, RecipeBuilder, Range.Start);
8855 
8856   // Interleave memory: for each Interleave Group we marked earlier as relevant
8857   // for this VPlan, replace the Recipes widening its memory instructions with a
8858   // single VPInterleaveRecipe at its insertion point.
8859   for (const auto *IG : InterleaveGroups) {
8860     auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
8861         RecipeBuilder.getRecipe(IG->getInsertPos()));
8862     SmallVector<VPValue *, 4> StoredValues;
8863     for (unsigned i = 0; i < IG->getFactor(); ++i)
8864       if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) {
8865         auto *StoreR =
8866             cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI));
8867         StoredValues.push_back(StoreR->getStoredValue());
8868       }
8869 
8870     bool NeedsMaskForGaps =
8871         IG->requiresScalarEpilogue() && !CM.isScalarEpilogueAllowed();
8872     auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
8873                                         Recipe->getMask(), NeedsMaskForGaps);
8874     VPIG->insertBefore(Recipe);
8875     unsigned J = 0;
8876     for (unsigned i = 0; i < IG->getFactor(); ++i)
8877       if (Instruction *Member = IG->getMember(i)) {
8878         VPRecipeBase *MemberR = RecipeBuilder.getRecipe(Member);
8879         if (!Member->getType()->isVoidTy()) {
8880           VPValue *OriginalV = MemberR->getVPSingleValue();
8881           OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
8882           J++;
8883         }
8884         MemberR->eraseFromParent();
8885       }
8886   }
8887 
8888   for (ElementCount VF : Range)
8889     Plan->addVF(VF);
8890   Plan->setName("Initial VPlan");
8891 
8892   // Replace VPValues for known constant strides guaranteed by predicate scalar
8893   // evolution.
8894   for (auto [_, Stride] : Legal->getLAI()->getSymbolicStrides()) {
8895     auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
8896     auto *ScevStride = dyn_cast<SCEVConstant>(PSE.getSCEV(StrideV));
8897     // Only handle constant strides for now.
8898     if (!ScevStride)
8899       continue;
8900     Constant *CI = ConstantInt::get(Stride->getType(), ScevStride->getAPInt());
8901 
8902     auto *ConstVPV = Plan->getVPValueOrAddLiveIn(CI);
8903     // The versioned value may not be used in the loop directly, so just add a
8904     // new live-in in those cases.
8905     Plan->getVPValueOrAddLiveIn(StrideV)->replaceAllUsesWith(ConstVPV);
8906   }
8907 
8908   // From this point onwards, VPlan-to-VPlan transformations may change the plan
8909   // in ways that accessing values using original IR values is incorrect.
8910   Plan->disableValue2VPValue();
8911 
8912   // Sink users of fixed-order recurrence past the recipe defining the previous
8913   // value and introduce FirstOrderRecurrenceSplice VPInstructions.
8914   if (!VPlanTransforms::adjustFixedOrderRecurrences(*Plan, Builder))
8915     return nullptr;
8916 
8917   if (useActiveLaneMask(Style)) {
8918     // TODO: Move checks to VPlanTransforms::addActiveLaneMask once
8919     // TailFoldingStyle is visible there.
8920     bool ForControlFlow = useActiveLaneMaskForControlFlow(Style);
8921     bool WithoutRuntimeCheck =
8922         Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
8923     VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow,
8924                                        WithoutRuntimeCheck);
8925   }
8926   return Plan;
8927 }
8928 
8929 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
8930   // Outer loop handling: They may require CFG and instruction level
8931   // transformations before even evaluating whether vectorization is profitable.
8932   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
8933   // the vectorization pipeline.
8934   assert(!OrigLoop->isInnermost());
8935   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
8936 
8937   // Create new empty VPlan
8938   auto Plan = VPlan::createInitialVPlan(
8939       createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop),
8940       *PSE.getSE());
8941 
8942   // Build hierarchical CFG
8943   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
8944   HCFGBuilder.buildHierarchicalCFG();
8945 
8946   for (ElementCount VF : Range)
8947     Plan->addVF(VF);
8948 
8949   VPlanTransforms::VPInstructionsToVPRecipes(
8950       Plan,
8951       [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
8952       *PSE.getSE(), *TLI);
8953 
8954   // Remove the existing terminator of the exiting block of the top-most region.
8955   // A BranchOnCount will be added instead when adding the canonical IV recipes.
8956   auto *Term =
8957       Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator();
8958   Term->eraseFromParent();
8959 
8960   // Tail folding is not supported for outer loops, so the induction increment
8961   // is guaranteed to not wrap.
8962   bool HasNUW = true;
8963   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW,
8964                         DebugLoc());
8965   return Plan;
8966 }
8967 
8968 // Adjust the recipes for reductions. For in-loop reductions the chain of
8969 // instructions leading from the loop exit instr to the phi need to be converted
8970 // to reductions, with one operand being vector and the other being the scalar
8971 // reduction chain. For other reductions, a select is introduced between the phi
8972 // and live-out recipes when folding the tail.
8973 void LoopVectorizationPlanner::adjustRecipesForReductions(
8974     VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
8975     ElementCount MinVF) {
8976   VPBasicBlock *Header = Plan->getVectorLoopRegion()->getEntryBasicBlock();
8977   // Gather all VPReductionPHIRecipe and sort them so that Intermediate stores
8978   // sank outside of the loop would keep the same order as they had in the
8979   // original loop.
8980   SmallVector<VPReductionPHIRecipe *> ReductionPHIList;
8981   for (VPRecipeBase &R : Header->phis()) {
8982     if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
8983       ReductionPHIList.emplace_back(ReductionPhi);
8984   }
8985   bool HasIntermediateStore = false;
8986   stable_sort(ReductionPHIList,
8987               [this, &HasIntermediateStore](const VPReductionPHIRecipe *R1,
8988                                             const VPReductionPHIRecipe *R2) {
8989                 auto *IS1 = R1->getRecurrenceDescriptor().IntermediateStore;
8990                 auto *IS2 = R2->getRecurrenceDescriptor().IntermediateStore;
8991                 HasIntermediateStore |= IS1 || IS2;
8992 
8993                 // If neither of the recipes has an intermediate store, keep the
8994                 // order the same.
8995                 if (!IS1 && !IS2)
8996                   return false;
8997 
8998                 // If only one of the recipes has an intermediate store, then
8999                 // move it towards the beginning of the list.
9000                 if (IS1 && !IS2)
9001                   return true;
9002 
9003                 if (!IS1 && IS2)
9004                   return false;
9005 
9006                 // If both recipes have an intermediate store, then the recipe
9007                 // with the later store should be processed earlier. So it
9008                 // should go to the beginning of the list.
9009                 return DT->dominates(IS2, IS1);
9010               });
9011 
9012   if (HasIntermediateStore && ReductionPHIList.size() > 1)
9013     for (VPRecipeBase *R : ReductionPHIList)
9014       R->moveBefore(*Header, Header->getFirstNonPhi());
9015 
9016   SmallVector<VPReductionPHIRecipe *> InLoopReductionPhis;
9017   for (VPRecipeBase &R : Header->phis()) {
9018     auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9019     if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered()))
9020       continue;
9021     InLoopReductionPhis.push_back(PhiR);
9022   }
9023 
9024   for (VPReductionPHIRecipe *PhiR : InLoopReductionPhis) {
9025     const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
9026     RecurKind Kind = RdxDesc.getRecurrenceKind();
9027     assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) &&
9028            "AnyOf reductions are not allowed for in-loop reductions");
9029 
9030     // Collect the chain of "link" recipes for the reduction starting at PhiR.
9031     SetVector<VPRecipeBase *> Worklist;
9032     Worklist.insert(PhiR);
9033     for (unsigned I = 0; I != Worklist.size(); ++I) {
9034       VPRecipeBase *Cur = Worklist[I];
9035       for (VPUser *U : Cur->getVPSingleValue()->users()) {
9036         auto *UserRecipe = dyn_cast<VPRecipeBase>(U);
9037         if (!UserRecipe)
9038           continue;
9039         assert(UserRecipe->getNumDefinedValues() == 1 &&
9040                "recipes must define exactly one result value");
9041         Worklist.insert(UserRecipe);
9042       }
9043     }
9044 
9045     // Visit operation "Links" along the reduction chain top-down starting from
9046     // the phi until LoopExitValue. We keep track of the previous item
9047     // (PreviousLink) to tell which of the two operands of a Link will remain
9048     // scalar and which will be reduced. For minmax by select(cmp), Link will be
9049     // the select instructions.
9050     VPRecipeBase *PreviousLink = PhiR; // Aka Worklist[0].
9051     for (VPRecipeBase *CurrentLink : Worklist.getArrayRef().drop_front()) {
9052       VPValue *PreviousLinkV = PreviousLink->getVPSingleValue();
9053 
9054       Instruction *CurrentLinkI = CurrentLink->getUnderlyingInstr();
9055 
9056       // Index of the first operand which holds a non-mask vector operand.
9057       unsigned IndexOfFirstOperand;
9058       // Recognize a call to the llvm.fmuladd intrinsic.
9059       bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
9060       VPValue *VecOp;
9061       VPBasicBlock *LinkVPBB = CurrentLink->getParent();
9062       if (IsFMulAdd) {
9063         assert(
9064             RecurrenceDescriptor::isFMulAddIntrinsic(CurrentLinkI) &&
9065             "Expected instruction to be a call to the llvm.fmuladd intrinsic");
9066         assert(((MinVF.isScalar() && isa<VPReplicateRecipe>(CurrentLink)) ||
9067                 isa<VPWidenCallRecipe>(CurrentLink)) &&
9068                CurrentLink->getOperand(2) == PreviousLinkV &&
9069                "expected a call where the previous link is the added operand");
9070 
9071         // If the instruction is a call to the llvm.fmuladd intrinsic then we
9072         // need to create an fmul recipe (multiplying the first two operands of
9073         // the fmuladd together) to use as the vector operand for the fadd
9074         // reduction.
9075         VPInstruction *FMulRecipe = new VPInstruction(
9076             Instruction::FMul,
9077             {CurrentLink->getOperand(0), CurrentLink->getOperand(1)},
9078             CurrentLinkI->getFastMathFlags());
9079         LinkVPBB->insert(FMulRecipe, CurrentLink->getIterator());
9080         VecOp = FMulRecipe;
9081       } else {
9082         if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9083           if (isa<VPWidenRecipe>(CurrentLink)) {
9084             assert(isa<CmpInst>(CurrentLinkI) &&
9085                    "need to have the compare of the select");
9086             continue;
9087           }
9088           assert(isa<VPWidenSelectRecipe>(CurrentLink) &&
9089                  "must be a select recipe");
9090           IndexOfFirstOperand = 1;
9091         } else {
9092           assert((MinVF.isScalar() || isa<VPWidenRecipe>(CurrentLink)) &&
9093                  "Expected to replace a VPWidenSC");
9094           IndexOfFirstOperand = 0;
9095         }
9096         // Note that for non-commutable operands (cmp-selects), the semantics of
9097         // the cmp-select are captured in the recurrence kind.
9098         unsigned VecOpId =
9099             CurrentLink->getOperand(IndexOfFirstOperand) == PreviousLinkV
9100                 ? IndexOfFirstOperand + 1
9101                 : IndexOfFirstOperand;
9102         VecOp = CurrentLink->getOperand(VecOpId);
9103         assert(VecOp != PreviousLinkV &&
9104                CurrentLink->getOperand(CurrentLink->getNumOperands() - 1 -
9105                                        (VecOpId - IndexOfFirstOperand)) ==
9106                    PreviousLinkV &&
9107                "PreviousLinkV must be the operand other than VecOp");
9108       }
9109 
9110       BasicBlock *BB = CurrentLinkI->getParent();
9111       VPValue *CondOp = nullptr;
9112       if (CM.blockNeedsPredicationForAnyReason(BB)) {
9113         VPBuilder::InsertPointGuard Guard(Builder);
9114         Builder.setInsertPoint(CurrentLink);
9115         CondOp = RecipeBuilder.createBlockInMask(BB, *Plan);
9116       }
9117 
9118       VPReductionRecipe *RedRecipe = new VPReductionRecipe(
9119           RdxDesc, CurrentLinkI, PreviousLinkV, VecOp, CondOp);
9120       // Append the recipe to the end of the VPBasicBlock because we need to
9121       // ensure that it comes after all of it's inputs, including CondOp.
9122       // Note that this transformation may leave over dead recipes (including
9123       // CurrentLink), which will be cleaned by a later VPlan transform.
9124       LinkVPBB->appendRecipe(RedRecipe);
9125       CurrentLink->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
9126       PreviousLink = RedRecipe;
9127     }
9128   }
9129     Builder.setInsertPoint(&*LatchVPBB->begin());
9130     for (VPRecipeBase &R :
9131          Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
9132     VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9133     if (!PhiR || PhiR->isInLoop())
9134       continue;
9135 
9136     const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
9137     auto *Result = PhiR->getBackedgeValue()->getDefiningRecipe();
9138     // If tail is folded by masking, introduce selects between the phi
9139     // and the live-out instruction of each reduction, at the beginning of the
9140     // dedicated latch block.
9141     if (CM.foldTailByMasking()) {
9142       VPValue *Cond =
9143           RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), *Plan);
9144       VPValue *Red = PhiR->getBackedgeValue();
9145       assert(Red->getDefiningRecipe()->getParent() != LatchVPBB &&
9146              "reduction recipe must be defined before latch");
9147       FastMathFlags FMFs = RdxDesc.getFastMathFlags();
9148       Type *PhiTy = PhiR->getOperand(0)->getLiveInIRValue()->getType();
9149       Result =
9150           PhiTy->isFloatingPointTy()
9151               ? new VPInstruction(Instruction::Select, {Cond, Red, PhiR}, FMFs)
9152               : new VPInstruction(Instruction::Select, {Cond, Red, PhiR});
9153       Result->insertBefore(&*Builder.getInsertPoint());
9154       Red->replaceUsesWithIf(
9155           Result->getVPSingleValue(),
9156           [](VPUser &U, unsigned) { return isa<VPLiveOut>(&U); });
9157       if (PreferPredicatedReductionSelect ||
9158           TTI.preferPredicatedReductionSelect(
9159               PhiR->getRecurrenceDescriptor().getOpcode(), PhiTy,
9160               TargetTransformInfo::ReductionFlags()))
9161         PhiR->setOperand(1, Result->getVPSingleValue());
9162     }
9163     // If the vector reduction can be performed in a smaller type, we truncate
9164     // then extend the loop exit value to enable InstCombine to evaluate the
9165     // entire expression in the smaller type.
9166     Type *PhiTy = PhiR->getStartValue()->getLiveInIRValue()->getType();
9167     if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
9168       assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
9169       Type *RdxTy = RdxDesc.getRecurrenceType();
9170       auto *Trunc = new VPWidenCastRecipe(Instruction::Trunc,
9171                                           Result->getVPSingleValue(), RdxTy);
9172       auto *Extnd =
9173           RdxDesc.isSigned()
9174               ? new VPWidenCastRecipe(Instruction::SExt, Trunc, PhiTy)
9175               : new VPWidenCastRecipe(Instruction::ZExt, Trunc, PhiTy);
9176 
9177       Trunc->insertAfter(Result);
9178       Extnd->insertAfter(Trunc);
9179       Result->getVPSingleValue()->replaceAllUsesWith(Extnd);
9180       Trunc->setOperand(0, Result->getVPSingleValue());
9181     }
9182   }
9183 
9184   VPlanTransforms::clearReductionWrapFlags(*Plan);
9185 }
9186 
9187 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
9188 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
9189                                VPSlotTracker &SlotTracker) const {
9190   O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
9191   IG->getInsertPos()->printAsOperand(O, false);
9192   O << ", ";
9193   getAddr()->printAsOperand(O, SlotTracker);
9194   VPValue *Mask = getMask();
9195   if (Mask) {
9196     O << ", ";
9197     Mask->printAsOperand(O, SlotTracker);
9198   }
9199 
9200   unsigned OpIdx = 0;
9201   for (unsigned i = 0; i < IG->getFactor(); ++i) {
9202     if (!IG->getMember(i))
9203       continue;
9204     if (getNumStoreOperands() > 0) {
9205       O << "\n" << Indent << "  store ";
9206       getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
9207       O << " to index " << i;
9208     } else {
9209       O << "\n" << Indent << "  ";
9210       getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
9211       O << " = load from index " << i;
9212     }
9213     ++OpIdx;
9214   }
9215 }
9216 #endif
9217 
9218 void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
9219   assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction &&
9220          "Not a pointer induction according to InductionDescriptor!");
9221   assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() &&
9222          "Unexpected type.");
9223 
9224   auto *IVR = getParent()->getPlan()->getCanonicalIV();
9225   PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0));
9226 
9227   if (onlyScalarsGenerated(State.VF)) {
9228     // This is the normalized GEP that starts counting at zero.
9229     Value *PtrInd = State.Builder.CreateSExtOrTrunc(
9230         CanonicalIV, IndDesc.getStep()->getType());
9231     // Determine the number of scalars we need to generate for each unroll
9232     // iteration. If the instruction is uniform, we only need to generate the
9233     // first lane. Otherwise, we generate all VF values.
9234     bool IsUniform = vputils::onlyFirstLaneUsed(this);
9235     assert((IsUniform || !State.VF.isScalable()) &&
9236            "Cannot scalarize a scalable VF");
9237     unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue();
9238 
9239     for (unsigned Part = 0; Part < State.UF; ++Part) {
9240       Value *PartStart =
9241           createStepForVF(State.Builder, PtrInd->getType(), State.VF, Part);
9242 
9243       for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
9244         Value *Idx = State.Builder.CreateAdd(
9245             PartStart, ConstantInt::get(PtrInd->getType(), Lane));
9246         Value *GlobalIdx = State.Builder.CreateAdd(PtrInd, Idx);
9247 
9248         Value *Step = State.get(getOperand(1), VPIteration(Part, Lane));
9249         Value *SclrGep = emitTransformedIndex(
9250             State.Builder, GlobalIdx, IndDesc.getStartValue(), Step,
9251             IndDesc.getKind(), IndDesc.getInductionBinOp());
9252         SclrGep->setName("next.gep");
9253         State.set(this, SclrGep, VPIteration(Part, Lane));
9254       }
9255     }
9256     return;
9257   }
9258 
9259   Type *PhiType = IndDesc.getStep()->getType();
9260 
9261   // Build a pointer phi
9262   Value *ScalarStartValue = getStartValue()->getLiveInIRValue();
9263   Type *ScStValueType = ScalarStartValue->getType();
9264   PHINode *NewPointerPhi =
9265       PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV);
9266 
9267   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
9268   NewPointerPhi->addIncoming(ScalarStartValue, VectorPH);
9269 
9270   // A pointer induction, performed by using a gep
9271   Instruction *InductionLoc = &*State.Builder.GetInsertPoint();
9272 
9273   Value *ScalarStepValue = State.get(getOperand(1), VPIteration(0, 0));
9274   Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF);
9275   Value *NumUnrolledElems =
9276       State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF));
9277   Value *InductionGEP = GetElementPtrInst::Create(
9278       State.Builder.getInt8Ty(), NewPointerPhi,
9279       State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
9280       InductionLoc);
9281   // Add induction update using an incorrect block temporarily. The phi node
9282   // will be fixed after VPlan execution. Note that at this point the latch
9283   // block cannot be used, as it does not exist yet.
9284   // TODO: Model increment value in VPlan, by turning the recipe into a
9285   // multi-def and a subclass of VPHeaderPHIRecipe.
9286   NewPointerPhi->addIncoming(InductionGEP, VectorPH);
9287 
9288   // Create UF many actual address geps that use the pointer
9289   // phi as base and a vectorized version of the step value
9290   // (<step*0, ..., step*N>) as offset.
9291   for (unsigned Part = 0; Part < State.UF; ++Part) {
9292     Type *VecPhiType = VectorType::get(PhiType, State.VF);
9293     Value *StartOffsetScalar =
9294         State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part));
9295     Value *StartOffset =
9296         State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
9297     // Create a vector of consecutive numbers from zero to VF.
9298     StartOffset = State.Builder.CreateAdd(
9299         StartOffset, State.Builder.CreateStepVector(VecPhiType));
9300 
9301     assert(ScalarStepValue == State.get(getOperand(1), VPIteration(Part, 0)) &&
9302            "scalar step must be the same across all parts");
9303     Value *GEP = State.Builder.CreateGEP(
9304         State.Builder.getInt8Ty(), NewPointerPhi,
9305         State.Builder.CreateMul(
9306             StartOffset,
9307             State.Builder.CreateVectorSplat(State.VF, ScalarStepValue),
9308             "vector.gep"));
9309     State.set(this, GEP, Part);
9310   }
9311 }
9312 
9313 void VPDerivedIVRecipe::execute(VPTransformState &State) {
9314   assert(!State.Instance && "VPDerivedIVRecipe being replicated.");
9315 
9316   // Fast-math-flags propagate from the original induction instruction.
9317   IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
9318   if (FPBinOp)
9319     State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags());
9320 
9321   Value *Step = State.get(getStepValue(), VPIteration(0, 0));
9322   Value *CanonicalIV = State.get(getCanonicalIV(), VPIteration(0, 0));
9323   Value *DerivedIV = emitTransformedIndex(
9324       State.Builder, CanonicalIV, getStartValue()->getLiveInIRValue(), Step,
9325       Kind, cast_if_present<BinaryOperator>(FPBinOp));
9326   DerivedIV->setName("offset.idx");
9327   if (TruncResultTy) {
9328     assert(TruncResultTy != DerivedIV->getType() &&
9329            Step->getType()->isIntegerTy() &&
9330            "Truncation requires an integer step");
9331     DerivedIV = State.Builder.CreateTrunc(DerivedIV, TruncResultTy);
9332   }
9333   assert(DerivedIV != CanonicalIV && "IV didn't need transforming?");
9334 
9335   State.set(this, DerivedIV, VPIteration(0, 0));
9336 }
9337 
9338 void VPInterleaveRecipe::execute(VPTransformState &State) {
9339   assert(!State.Instance && "Interleave group being replicated.");
9340   State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
9341                                       getStoredValues(), getMask(),
9342                                       NeedsMaskForGaps);
9343 }
9344 
9345 void VPReductionRecipe::execute(VPTransformState &State) {
9346   assert(!State.Instance && "Reduction being replicated.");
9347   Value *PrevInChain = State.get(getChainOp(), 0);
9348   RecurKind Kind = RdxDesc.getRecurrenceKind();
9349   bool IsOrdered = State.ILV->useOrderedReductions(RdxDesc);
9350   // Propagate the fast-math flags carried by the underlying instruction.
9351   IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
9352   State.Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
9353   for (unsigned Part = 0; Part < State.UF; ++Part) {
9354     Value *NewVecOp = State.get(getVecOp(), Part);
9355     if (VPValue *Cond = getCondOp()) {
9356       Value *NewCond = State.VF.isVector() ? State.get(Cond, Part)
9357                                            : State.get(Cond, {Part, 0});
9358       VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType());
9359       Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType();
9360       Value *Iden = RdxDesc.getRecurrenceIdentity(Kind, ElementTy,
9361                                                   RdxDesc.getFastMathFlags());
9362       if (State.VF.isVector()) {
9363         Iden =
9364             State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden);
9365       }
9366 
9367       Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Iden);
9368       NewVecOp = Select;
9369     }
9370     Value *NewRed;
9371     Value *NextInChain;
9372     if (IsOrdered) {
9373       if (State.VF.isVector())
9374         NewRed = createOrderedReduction(State.Builder, RdxDesc, NewVecOp,
9375                                         PrevInChain);
9376       else
9377         NewRed = State.Builder.CreateBinOp(
9378             (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), PrevInChain,
9379             NewVecOp);
9380       PrevInChain = NewRed;
9381     } else {
9382       PrevInChain = State.get(getChainOp(), Part);
9383       NewRed = createTargetReduction(State.Builder, RdxDesc, NewVecOp);
9384     }
9385     if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9386       NextInChain = createMinMaxOp(State.Builder, RdxDesc.getRecurrenceKind(),
9387                                    NewRed, PrevInChain);
9388     } else if (IsOrdered)
9389       NextInChain = NewRed;
9390     else
9391       NextInChain = State.Builder.CreateBinOp(
9392           (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, PrevInChain);
9393     State.set(this, NextInChain, Part);
9394   }
9395 }
9396 
9397 void VPReplicateRecipe::execute(VPTransformState &State) {
9398   Instruction *UI = getUnderlyingInstr();
9399   if (State.Instance) { // Generate a single instance.
9400     assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9401     State.ILV->scalarizeInstruction(UI, this, *State.Instance, State);
9402     // Insert scalar instance packing it into a vector.
9403     if (State.VF.isVector() && shouldPack()) {
9404       // If we're constructing lane 0, initialize to start from poison.
9405       if (State.Instance->Lane.isFirstLane()) {
9406         assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9407         Value *Poison = PoisonValue::get(
9408             VectorType::get(UI->getType(), State.VF));
9409         State.set(this, Poison, State.Instance->Part);
9410       }
9411       State.packScalarIntoVectorValue(this, *State.Instance);
9412     }
9413     return;
9414   }
9415 
9416   if (IsUniform) {
9417     // If the recipe is uniform across all parts (instead of just per VF), only
9418     // generate a single instance.
9419     if ((isa<LoadInst>(UI) || isa<StoreInst>(UI)) &&
9420         all_of(operands(), [](VPValue *Op) {
9421           return Op->isDefinedOutsideVectorRegions();
9422         })) {
9423       State.ILV->scalarizeInstruction(UI, this, VPIteration(0, 0), State);
9424       if (user_begin() != user_end()) {
9425         for (unsigned Part = 1; Part < State.UF; ++Part)
9426           State.set(this, State.get(this, VPIteration(0, 0)),
9427                     VPIteration(Part, 0));
9428       }
9429       return;
9430     }
9431 
9432     // Uniform within VL means we need to generate lane 0 only for each
9433     // unrolled copy.
9434     for (unsigned Part = 0; Part < State.UF; ++Part)
9435       State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, 0), State);
9436     return;
9437   }
9438 
9439   // A store of a loop varying value to a uniform address only needs the last
9440   // copy of the store.
9441   if (isa<StoreInst>(UI) &&
9442       vputils::isUniformAfterVectorization(getOperand(1))) {
9443     auto Lane = VPLane::getLastLaneForVF(State.VF);
9444     State.ILV->scalarizeInstruction(UI, this, VPIteration(State.UF - 1, Lane),
9445                                     State);
9446     return;
9447   }
9448 
9449   // Generate scalar instances for all VF lanes of all UF parts.
9450   assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9451   const unsigned EndLane = State.VF.getKnownMinValue();
9452   for (unsigned Part = 0; Part < State.UF; ++Part)
9453     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9454       State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, Lane), State);
9455 }
9456 
9457 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
9458   VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;
9459 
9460   // Attempt to issue a wide load.
9461   LoadInst *LI = dyn_cast<LoadInst>(&Ingredient);
9462   StoreInst *SI = dyn_cast<StoreInst>(&Ingredient);
9463 
9464   assert((LI || SI) && "Invalid Load/Store instruction");
9465   assert((!SI || StoredValue) && "No stored value provided for widened store");
9466   assert((!LI || !StoredValue) && "Stored value provided for widened load");
9467 
9468   Type *ScalarDataTy = getLoadStoreType(&Ingredient);
9469 
9470   auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
9471   const Align Alignment = getLoadStoreAlignment(&Ingredient);
9472   bool CreateGatherScatter = !isConsecutive();
9473 
9474   auto &Builder = State.Builder;
9475   InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF);
9476   bool isMaskRequired = getMask();
9477   if (isMaskRequired) {
9478     // Mask reversal is only neede for non-all-one (null) masks, as reverse of a
9479     // null all-one mask is a null mask.
9480     for (unsigned Part = 0; Part < State.UF; ++Part) {
9481       Value *Mask = State.get(getMask(), Part);
9482       if (isReverse())
9483         Mask = Builder.CreateVectorReverse(Mask, "reverse");
9484       BlockInMaskParts[Part] = Mask;
9485     }
9486   }
9487 
9488   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
9489     // Calculate the pointer for the specific unroll-part.
9490     Value *PartPtr = nullptr;
9491 
9492     // Use i32 for the gep index type when the value is constant,
9493     // or query DataLayout for a more suitable index type otherwise.
9494     const DataLayout &DL =
9495         Builder.GetInsertBlock()->getModule()->getDataLayout();
9496     Type *IndexTy = State.VF.isScalable() && (isReverse() || Part > 0)
9497                         ? DL.getIndexType(PointerType::getUnqual(
9498                               ScalarDataTy->getContext()))
9499                         : Builder.getInt32Ty();
9500     bool InBounds = false;
9501     if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
9502       InBounds = gep->isInBounds();
9503     if (isReverse()) {
9504       // If the address is consecutive but reversed, then the
9505       // wide store needs to start at the last vector element.
9506       // RunTimeVF =  VScale * VF.getKnownMinValue()
9507       // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue()
9508       Value *RunTimeVF = getRuntimeVF(Builder, IndexTy, State.VF);
9509       // NumElt = -Part * RunTimeVF
9510       Value *NumElt =
9511           Builder.CreateMul(ConstantInt::get(IndexTy, -(int64_t)Part), RunTimeVF);
9512       // LastLane = 1 - RunTimeVF
9513       Value *LastLane =
9514           Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF);
9515       PartPtr = Builder.CreateGEP(ScalarDataTy, Ptr, NumElt, "", InBounds);
9516       PartPtr =
9517           Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane, "", InBounds);
9518     } else {
9519       Value *Increment = createStepForVF(Builder, IndexTy, State.VF, Part);
9520       PartPtr = Builder.CreateGEP(ScalarDataTy, Ptr, Increment, "", InBounds);
9521     }
9522 
9523     return PartPtr;
9524   };
9525 
9526   // Handle Stores:
9527   if (SI) {
9528     State.setDebugLocFrom(SI->getDebugLoc());
9529 
9530     for (unsigned Part = 0; Part < State.UF; ++Part) {
9531       Instruction *NewSI = nullptr;
9532       Value *StoredVal = State.get(StoredValue, Part);
9533       if (CreateGatherScatter) {
9534         Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
9535         Value *VectorGep = State.get(getAddr(), Part);
9536         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
9537                                             MaskPart);
9538       } else {
9539         if (isReverse()) {
9540           // If we store to reverse consecutive memory locations, then we need
9541           // to reverse the order of elements in the stored value.
9542           StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
9543           // We don't want to update the value in the map as it might be used in
9544           // another expression. So don't call resetVectorValue(StoredVal).
9545         }
9546         auto *VecPtr =
9547             CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
9548         if (isMaskRequired)
9549           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
9550                                             BlockInMaskParts[Part]);
9551         else
9552           NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
9553       }
9554       State.addMetadata(NewSI, SI);
9555     }
9556     return;
9557   }
9558 
9559   // Handle loads.
9560   assert(LI && "Must have a load instruction");
9561   State.setDebugLocFrom(LI->getDebugLoc());
9562   for (unsigned Part = 0; Part < State.UF; ++Part) {
9563     Value *NewLI;
9564     if (CreateGatherScatter) {
9565       Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
9566       Value *VectorGep = State.get(getAddr(), Part);
9567       NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart,
9568                                          nullptr, "wide.masked.gather");
9569       State.addMetadata(NewLI, LI);
9570     } else {
9571       auto *VecPtr =
9572           CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
9573       if (isMaskRequired)
9574         NewLI = Builder.CreateMaskedLoad(
9575             DataTy, VecPtr, Alignment, BlockInMaskParts[Part],
9576             PoisonValue::get(DataTy), "wide.masked.load");
9577       else
9578         NewLI =
9579             Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
9580 
9581       // Add metadata to the load, but setVectorValue to the reverse shuffle.
9582       State.addMetadata(NewLI, LI);
9583       if (Reverse)
9584         NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
9585     }
9586 
9587     State.set(getVPSingleValue(), NewLI, Part);
9588   }
9589 }
9590 
9591 // Determine how to lower the scalar epilogue, which depends on 1) optimising
9592 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
9593 // predication, and 4) a TTI hook that analyses whether the loop is suitable
9594 // for predication.
9595 static ScalarEpilogueLowering getScalarEpilogueLowering(
9596     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
9597     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
9598     LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI) {
9599   // 1) OptSize takes precedence over all other options, i.e. if this is set,
9600   // don't look at hints or options, and don't request a scalar epilogue.
9601   // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
9602   // LoopAccessInfo (due to code dependency and not being able to reliably get
9603   // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
9604   // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
9605   // versioning when the vectorization is forced, unlike hasOptSize. So revert
9606   // back to the old way and vectorize with versioning when forced. See D81345.)
9607   if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
9608                                                       PGSOQueryType::IRPass) &&
9609                           Hints.getForce() != LoopVectorizeHints::FK_Enabled))
9610     return CM_ScalarEpilogueNotAllowedOptSize;
9611 
9612   // 2) If set, obey the directives
9613   if (PreferPredicateOverEpilogue.getNumOccurrences()) {
9614     switch (PreferPredicateOverEpilogue) {
9615     case PreferPredicateTy::ScalarEpilogue:
9616       return CM_ScalarEpilogueAllowed;
9617     case PreferPredicateTy::PredicateElseScalarEpilogue:
9618       return CM_ScalarEpilogueNotNeededUsePredicate;
9619     case PreferPredicateTy::PredicateOrDontVectorize:
9620       return CM_ScalarEpilogueNotAllowedUsePredicate;
9621     };
9622   }
9623 
9624   // 3) If set, obey the hints
9625   switch (Hints.getPredicate()) {
9626   case LoopVectorizeHints::FK_Enabled:
9627     return CM_ScalarEpilogueNotNeededUsePredicate;
9628   case LoopVectorizeHints::FK_Disabled:
9629     return CM_ScalarEpilogueAllowed;
9630   };
9631 
9632   // 4) if the TTI hook indicates this is profitable, request predication.
9633   TailFoldingInfo TFI(TLI, &LVL, IAI);
9634   if (TTI->preferPredicateOverEpilogue(&TFI))
9635     return CM_ScalarEpilogueNotNeededUsePredicate;
9636 
9637   return CM_ScalarEpilogueAllowed;
9638 }
9639 
9640 // Process the loop in the VPlan-native vectorization path. This path builds
9641 // VPlan upfront in the vectorization pipeline, which allows to apply
9642 // VPlan-to-VPlan transformations from the very beginning without modifying the
9643 // input LLVM IR.
9644 static bool processLoopInVPlanNativePath(
9645     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
9646     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
9647     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
9648     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
9649     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
9650     LoopVectorizationRequirements &Requirements) {
9651 
9652   if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
9653     LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
9654     return false;
9655   }
9656   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
9657   Function *F = L->getHeader()->getParent();
9658   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
9659 
9660   ScalarEpilogueLowering SEL =
9661       getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, *LVL, &IAI);
9662 
9663   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
9664                                 &Hints, IAI);
9665   // Use the planner for outer loop vectorization.
9666   // TODO: CM is not used at this point inside the planner. Turn CM into an
9667   // optional argument if we don't need it in the future.
9668   LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, IAI, PSE, Hints,
9669                                ORE);
9670 
9671   // Get user vectorization factor.
9672   ElementCount UserVF = Hints.getWidth();
9673 
9674   CM.collectElementTypesForWidening();
9675 
9676   // Plan how to best vectorize, return the best VF and its cost.
9677   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
9678 
9679   // If we are stress testing VPlan builds, do not attempt to generate vector
9680   // code. Masked vector code generation support will follow soon.
9681   // Also, do not attempt to vectorize if no vector code will be produced.
9682   if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF)
9683     return false;
9684 
9685   VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
9686 
9687   {
9688     bool AddBranchWeights =
9689         hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
9690     GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
9691                              F->getParent()->getDataLayout(), AddBranchWeights);
9692     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
9693                            VF.Width, 1, LVL, &CM, BFI, PSI, Checks);
9694     LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
9695                       << L->getHeader()->getParent()->getName() << "\"\n");
9696     LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false);
9697   }
9698 
9699   reportVectorization(ORE, L, VF, 1);
9700 
9701   // Mark the loop as already vectorized to avoid vectorizing again.
9702   Hints.setAlreadyVectorized();
9703   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
9704   return true;
9705 }
9706 
9707 // Emit a remark if there are stores to floats that required a floating point
9708 // extension. If the vectorized loop was generated with floating point there
9709 // will be a performance penalty from the conversion overhead and the change in
9710 // the vector width.
9711 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
9712   SmallVector<Instruction *, 4> Worklist;
9713   for (BasicBlock *BB : L->getBlocks()) {
9714     for (Instruction &Inst : *BB) {
9715       if (auto *S = dyn_cast<StoreInst>(&Inst)) {
9716         if (S->getValueOperand()->getType()->isFloatTy())
9717           Worklist.push_back(S);
9718       }
9719     }
9720   }
9721 
9722   // Traverse the floating point stores upwards searching, for floating point
9723   // conversions.
9724   SmallPtrSet<const Instruction *, 4> Visited;
9725   SmallPtrSet<const Instruction *, 4> EmittedRemark;
9726   while (!Worklist.empty()) {
9727     auto *I = Worklist.pop_back_val();
9728     if (!L->contains(I))
9729       continue;
9730     if (!Visited.insert(I).second)
9731       continue;
9732 
9733     // Emit a remark if the floating point store required a floating
9734     // point conversion.
9735     // TODO: More work could be done to identify the root cause such as a
9736     // constant or a function return type and point the user to it.
9737     if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
9738       ORE->emit([&]() {
9739         return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
9740                                           I->getDebugLoc(), L->getHeader())
9741                << "floating point conversion changes vector width. "
9742                << "Mixed floating point precision requires an up/down "
9743                << "cast that will negatively impact performance.";
9744       });
9745 
9746     for (Use &Op : I->operands())
9747       if (auto *OpI = dyn_cast<Instruction>(Op))
9748         Worklist.push_back(OpI);
9749   }
9750 }
9751 
9752 static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
9753                                        VectorizationFactor &VF,
9754                                        std::optional<unsigned> VScale, Loop *L,
9755                                        ScalarEvolution &SE,
9756                                        ScalarEpilogueLowering SEL) {
9757   InstructionCost CheckCost = Checks.getCost();
9758   if (!CheckCost.isValid())
9759     return false;
9760 
9761   // When interleaving only scalar and vector cost will be equal, which in turn
9762   // would lead to a divide by 0. Fall back to hard threshold.
9763   if (VF.Width.isScalar()) {
9764     if (CheckCost > VectorizeMemoryCheckThreshold) {
9765       LLVM_DEBUG(
9766           dbgs()
9767           << "LV: Interleaving only is not profitable due to runtime checks\n");
9768       return false;
9769     }
9770     return true;
9771   }
9772 
9773   // The scalar cost should only be 0 when vectorizing with a user specified VF/IC. In those cases, runtime checks should always be generated.
9774   double ScalarC = *VF.ScalarCost.getValue();
9775   if (ScalarC == 0)
9776     return true;
9777 
9778   // First, compute the minimum iteration count required so that the vector
9779   // loop outperforms the scalar loop.
9780   //  The total cost of the scalar loop is
9781   //   ScalarC * TC
9782   //  where
9783   //  * TC is the actual trip count of the loop.
9784   //  * ScalarC is the cost of a single scalar iteration.
9785   //
9786   //  The total cost of the vector loop is
9787   //    RtC + VecC * (TC / VF) + EpiC
9788   //  where
9789   //  * RtC is the cost of the generated runtime checks
9790   //  * VecC is the cost of a single vector iteration.
9791   //  * TC is the actual trip count of the loop
9792   //  * VF is the vectorization factor
9793   //  * EpiCost is the cost of the generated epilogue, including the cost
9794   //    of the remaining scalar operations.
9795   //
9796   // Vectorization is profitable once the total vector cost is less than the
9797   // total scalar cost:
9798   //   RtC + VecC * (TC / VF) + EpiC <  ScalarC * TC
9799   //
9800   // Now we can compute the minimum required trip count TC as
9801   //   (RtC + EpiC) / (ScalarC - (VecC / VF)) < TC
9802   //
9803   // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
9804   // the computations are performed on doubles, not integers and the result
9805   // is rounded up, hence we get an upper estimate of the TC.
9806   unsigned IntVF = VF.Width.getKnownMinValue();
9807   if (VF.Width.isScalable()) {
9808     unsigned AssumedMinimumVscale = 1;
9809     if (VScale)
9810       AssumedMinimumVscale = *VScale;
9811     IntVF *= AssumedMinimumVscale;
9812   }
9813   double VecCOverVF = double(*VF.Cost.getValue()) / IntVF;
9814   double RtC = *CheckCost.getValue();
9815   double MinTC1 = RtC / (ScalarC - VecCOverVF);
9816 
9817   // Second, compute a minimum iteration count so that the cost of the
9818   // runtime checks is only a fraction of the total scalar loop cost. This
9819   // adds a loop-dependent bound on the overhead incurred if the runtime
9820   // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC
9821   // * TC. To bound the runtime check to be a fraction 1/X of the scalar
9822   // cost, compute
9823   //   RtC < ScalarC * TC * (1 / X)  ==>  RtC * X / ScalarC < TC
9824   double MinTC2 = RtC * 10 / ScalarC;
9825 
9826   // Now pick the larger minimum. If it is not a multiple of VF and a scalar
9827   // epilogue is allowed, choose the next closest multiple of VF. This should
9828   // partly compensate for ignoring the epilogue cost.
9829   uint64_t MinTC = std::ceil(std::max(MinTC1, MinTC2));
9830   if (SEL == CM_ScalarEpilogueAllowed)
9831     MinTC = alignTo(MinTC, IntVF);
9832   VF.MinProfitableTripCount = ElementCount::getFixed(MinTC);
9833 
9834   LLVM_DEBUG(
9835       dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
9836              << VF.MinProfitableTripCount << "\n");
9837 
9838   // Skip vectorization if the expected trip count is less than the minimum
9839   // required trip count.
9840   if (auto ExpectedTC = getSmallBestKnownTC(SE, L)) {
9841     if (ElementCount::isKnownLT(ElementCount::getFixed(*ExpectedTC),
9842                                 VF.MinProfitableTripCount)) {
9843       LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
9844                            "trip count < minimum profitable VF ("
9845                         << *ExpectedTC << " < " << VF.MinProfitableTripCount
9846                         << ")\n");
9847 
9848       return false;
9849     }
9850   }
9851   return true;
9852 }
9853 
9854 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
9855     : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
9856                                !EnableLoopInterleaving),
9857       VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
9858                               !EnableLoopVectorization) {}
9859 
9860 bool LoopVectorizePass::processLoop(Loop *L) {
9861   assert((EnableVPlanNativePath || L->isInnermost()) &&
9862          "VPlan-native path is not enabled. Only process inner loops.");
9863 
9864 #ifndef NDEBUG
9865   const std::string DebugLocStr = getDebugLocString(L);
9866 #endif /* NDEBUG */
9867 
9868   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
9869                     << L->getHeader()->getParent()->getName() << "' from "
9870                     << DebugLocStr << "\n");
9871 
9872   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
9873 
9874   LLVM_DEBUG(
9875       dbgs() << "LV: Loop hints:"
9876              << " force="
9877              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
9878                      ? "disabled"
9879                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
9880                             ? "enabled"
9881                             : "?"))
9882              << " width=" << Hints.getWidth()
9883              << " interleave=" << Hints.getInterleave() << "\n");
9884 
9885   // Function containing loop
9886   Function *F = L->getHeader()->getParent();
9887 
9888   // Looking at the diagnostic output is the only way to determine if a loop
9889   // was vectorized (other than looking at the IR or machine code), so it
9890   // is important to generate an optimization remark for each loop. Most of
9891   // these messages are generated as OptimizationRemarkAnalysis. Remarks
9892   // generated as OptimizationRemark and OptimizationRemarkMissed are
9893   // less verbose reporting vectorized loops and unvectorized loops that may
9894   // benefit from vectorization, respectively.
9895 
9896   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
9897     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
9898     return false;
9899   }
9900 
9901   PredicatedScalarEvolution PSE(*SE, *L);
9902 
9903   // Check if it is legal to vectorize the loop.
9904   LoopVectorizationRequirements Requirements;
9905   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE,
9906                                 &Requirements, &Hints, DB, AC, BFI, PSI);
9907   if (!LVL.canVectorize(EnableVPlanNativePath)) {
9908     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
9909     Hints.emitRemarkWithHints();
9910     return false;
9911   }
9912 
9913   // Entrance to the VPlan-native vectorization path. Outer loops are processed
9914   // here. They may require CFG and instruction level transformations before
9915   // even evaluating whether vectorization is profitable. Since we cannot modify
9916   // the incoming IR, we need to build VPlan upfront in the vectorization
9917   // pipeline.
9918   if (!L->isInnermost())
9919     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
9920                                         ORE, BFI, PSI, Hints, Requirements);
9921 
9922   assert(L->isInnermost() && "Inner loop expected.");
9923 
9924   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
9925   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
9926 
9927   // If an override option has been passed in for interleaved accesses, use it.
9928   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
9929     UseInterleaved = EnableInterleavedMemAccesses;
9930 
9931   // Analyze interleaved memory accesses.
9932   if (UseInterleaved)
9933     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
9934 
9935   // Check the function attributes and profiles to find out if this function
9936   // should be optimized for size.
9937   ScalarEpilogueLowering SEL =
9938       getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL, &IAI);
9939 
9940   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
9941   // count by optimizing for size, to minimize overheads.
9942   auto ExpectedTC = getSmallBestKnownTC(*SE, L);
9943   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
9944     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
9945                       << "This loop is worth vectorizing only if no scalar "
9946                       << "iteration overheads are incurred.");
9947     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
9948       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
9949     else {
9950       if (*ExpectedTC > TTI->getMinTripCountTailFoldingThreshold()) {
9951         LLVM_DEBUG(dbgs() << "\n");
9952         // Predicate tail-folded loops are efficient even when the loop
9953         // iteration count is low. However, setting the epilogue policy to
9954         // `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops
9955         // with runtime checks. It's more effective to let
9956         // `areRuntimeChecksProfitable` determine if vectorization is beneficial
9957         // for the loop.
9958         if (SEL != CM_ScalarEpilogueNotNeededUsePredicate)
9959           SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
9960       } else {
9961         LLVM_DEBUG(dbgs() << " But the target considers the trip count too "
9962                              "small to consider vectorizing.\n");
9963         reportVectorizationFailure(
9964             "The trip count is below the minial threshold value.",
9965             "loop trip count is too low, avoiding vectorization",
9966             "LowTripCount", ORE, L);
9967         Hints.emitRemarkWithHints();
9968         return false;
9969       }
9970     }
9971   }
9972 
9973   // Check the function attributes to see if implicit floats or vectors are
9974   // allowed.
9975   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
9976     reportVectorizationFailure(
9977         "Can't vectorize when the NoImplicitFloat attribute is used",
9978         "loop not vectorized due to NoImplicitFloat attribute",
9979         "NoImplicitFloat", ORE, L);
9980     Hints.emitRemarkWithHints();
9981     return false;
9982   }
9983 
9984   // Check if the target supports potentially unsafe FP vectorization.
9985   // FIXME: Add a check for the type of safety issue (denormal, signaling)
9986   // for the target we're vectorizing for, to make sure none of the
9987   // additional fp-math flags can help.
9988   if (Hints.isPotentiallyUnsafe() &&
9989       TTI->isFPVectorizationPotentiallyUnsafe()) {
9990     reportVectorizationFailure(
9991         "Potentially unsafe FP op prevents vectorization",
9992         "loop not vectorized due to unsafe FP support.",
9993         "UnsafeFP", ORE, L);
9994     Hints.emitRemarkWithHints();
9995     return false;
9996   }
9997 
9998   bool AllowOrderedReductions;
9999   // If the flag is set, use that instead and override the TTI behaviour.
10000   if (ForceOrderedReductions.getNumOccurrences() > 0)
10001     AllowOrderedReductions = ForceOrderedReductions;
10002   else
10003     AllowOrderedReductions = TTI->enableOrderedReductions();
10004   if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) {
10005     ORE->emit([&]() {
10006       auto *ExactFPMathInst = Requirements.getExactFPInst();
10007       return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
10008                                                  ExactFPMathInst->getDebugLoc(),
10009                                                  ExactFPMathInst->getParent())
10010              << "loop not vectorized: cannot prove it is safe to reorder "
10011                 "floating-point operations";
10012     });
10013     LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
10014                          "reorder floating-point operations\n");
10015     Hints.emitRemarkWithHints();
10016     return false;
10017   }
10018 
10019   // Use the cost model.
10020   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
10021                                 F, &Hints, IAI);
10022   // Use the planner for vectorization.
10023   LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
10024                                ORE);
10025 
10026   // Get user vectorization factor and interleave count.
10027   ElementCount UserVF = Hints.getWidth();
10028   unsigned UserIC = Hints.getInterleave();
10029 
10030   // Plan how to best vectorize, return the best VF and its cost.
10031   std::optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
10032 
10033   VectorizationFactor VF = VectorizationFactor::Disabled();
10034   unsigned IC = 1;
10035 
10036   bool AddBranchWeights =
10037       hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
10038   GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
10039                            F->getParent()->getDataLayout(), AddBranchWeights);
10040   if (MaybeVF) {
10041     VF = *MaybeVF;
10042     // Select the interleave count.
10043     IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
10044 
10045     unsigned SelectedIC = std::max(IC, UserIC);
10046     //  Optimistically generate runtime checks if they are needed. Drop them if
10047     //  they turn out to not be profitable.
10048     if (VF.Width.isVector() || SelectedIC > 1)
10049       Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
10050 
10051     // Check if it is profitable to vectorize with runtime checks.
10052     bool ForceVectorization =
10053         Hints.getForce() == LoopVectorizeHints::FK_Enabled;
10054     if (!ForceVectorization &&
10055         !areRuntimeChecksProfitable(Checks, VF, getVScaleForTuning(L, *TTI), L,
10056                                     *PSE.getSE(), SEL)) {
10057       ORE->emit([&]() {
10058         return OptimizationRemarkAnalysisAliasing(
10059                    DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
10060                    L->getHeader())
10061                << "loop not vectorized: cannot prove it is safe to reorder "
10062                   "memory operations";
10063       });
10064       LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
10065       Hints.emitRemarkWithHints();
10066       return false;
10067     }
10068   }
10069 
10070   // Identify the diagnostic messages that should be produced.
10071   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
10072   bool VectorizeLoop = true, InterleaveLoop = true;
10073   if (VF.Width.isScalar()) {
10074     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
10075     VecDiagMsg = std::make_pair(
10076         "VectorizationNotBeneficial",
10077         "the cost-model indicates that vectorization is not beneficial");
10078     VectorizeLoop = false;
10079   }
10080 
10081   if (!MaybeVF && UserIC > 1) {
10082     // Tell the user interleaving was avoided up-front, despite being explicitly
10083     // requested.
10084     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
10085                          "interleaving should be avoided up front\n");
10086     IntDiagMsg = std::make_pair(
10087         "InterleavingAvoided",
10088         "Ignoring UserIC, because interleaving was avoided up front");
10089     InterleaveLoop = false;
10090   } else if (IC == 1 && UserIC <= 1) {
10091     // Tell the user interleaving is not beneficial.
10092     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
10093     IntDiagMsg = std::make_pair(
10094         "InterleavingNotBeneficial",
10095         "the cost-model indicates that interleaving is not beneficial");
10096     InterleaveLoop = false;
10097     if (UserIC == 1) {
10098       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
10099       IntDiagMsg.second +=
10100           " and is explicitly disabled or interleave count is set to 1";
10101     }
10102   } else if (IC > 1 && UserIC == 1) {
10103     // Tell the user interleaving is beneficial, but it explicitly disabled.
10104     LLVM_DEBUG(
10105         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
10106     IntDiagMsg = std::make_pair(
10107         "InterleavingBeneficialButDisabled",
10108         "the cost-model indicates that interleaving is beneficial "
10109         "but is explicitly disabled or interleave count is set to 1");
10110     InterleaveLoop = false;
10111   }
10112 
10113   // Override IC if user provided an interleave count.
10114   IC = UserIC > 0 ? UserIC : IC;
10115 
10116   // Emit diagnostic messages, if any.
10117   const char *VAPassName = Hints.vectorizeAnalysisPassName();
10118   if (!VectorizeLoop && !InterleaveLoop) {
10119     // Do not vectorize or interleaving the loop.
10120     ORE->emit([&]() {
10121       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
10122                                       L->getStartLoc(), L->getHeader())
10123              << VecDiagMsg.second;
10124     });
10125     ORE->emit([&]() {
10126       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
10127                                       L->getStartLoc(), L->getHeader())
10128              << IntDiagMsg.second;
10129     });
10130     return false;
10131   } else if (!VectorizeLoop && InterleaveLoop) {
10132     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10133     ORE->emit([&]() {
10134       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
10135                                         L->getStartLoc(), L->getHeader())
10136              << VecDiagMsg.second;
10137     });
10138   } else if (VectorizeLoop && !InterleaveLoop) {
10139     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10140                       << ") in " << DebugLocStr << '\n');
10141     ORE->emit([&]() {
10142       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
10143                                         L->getStartLoc(), L->getHeader())
10144              << IntDiagMsg.second;
10145     });
10146   } else if (VectorizeLoop && InterleaveLoop) {
10147     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10148                       << ") in " << DebugLocStr << '\n');
10149     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10150   }
10151 
10152   bool DisableRuntimeUnroll = false;
10153   MDNode *OrigLoopID = L->getLoopID();
10154   {
10155     using namespace ore;
10156     if (!VectorizeLoop) {
10157       assert(IC > 1 && "interleave count should not be 1 or 0");
10158       // If we decided that it is not legal to vectorize the loop, then
10159       // interleave it.
10160       InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
10161                                  &CM, BFI, PSI, Checks);
10162 
10163       VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10164       LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);
10165 
10166       ORE->emit([&]() {
10167         return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
10168                                   L->getHeader())
10169                << "interleaved loop (interleaved count: "
10170                << NV("InterleaveCount", IC) << ")";
10171       });
10172     } else {
10173       // If we decided that it is *legal* to vectorize the loop, then do it.
10174 
10175       // Consider vectorizing the epilogue too if it's profitable.
10176       VectorizationFactor EpilogueVF =
10177           LVP.selectEpilogueVectorizationFactor(VF.Width, IC);
10178       if (EpilogueVF.Width.isVector()) {
10179 
10180         // The first pass vectorizes the main loop and creates a scalar epilogue
10181         // to be vectorized by executing the plan (potentially with a different
10182         // factor) again shortly afterwards.
10183         EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1);
10184         EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
10185                                            EPI, &LVL, &CM, BFI, PSI, Checks);
10186 
10187         VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF);
10188         auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF,
10189                                              BestMainPlan, MainILV, DT, true);
10190         ++LoopsVectorized;
10191 
10192         // Second pass vectorizes the epilogue and adjusts the control flow
10193         // edges from the first pass.
10194         EPI.MainLoopVF = EPI.EpilogueVF;
10195         EPI.MainLoopUF = EPI.EpilogueUF;
10196         EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
10197                                                  ORE, EPI, &LVL, &CM, BFI, PSI,
10198                                                  Checks);
10199 
10200         VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF);
10201         VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion();
10202         VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
10203         Header->setName("vec.epilog.vector.body");
10204 
10205         // Re-use the trip count and steps expanded for the main loop, as
10206         // skeleton creation needs it as a value that dominates both the scalar
10207         // and vector epilogue loops
10208         // TODO: This is a workaround needed for epilogue vectorization and it
10209         // should be removed once induction resume value creation is done
10210         // directly in VPlan.
10211         EpilogILV.setTripCount(MainILV.getTripCount());
10212         for (auto &R : make_early_inc_range(*BestEpiPlan.getPreheader())) {
10213           auto *ExpandR = cast<VPExpandSCEVRecipe>(&R);
10214           auto *ExpandedVal = BestEpiPlan.getVPValueOrAddLiveIn(
10215               ExpandedSCEVs.find(ExpandR->getSCEV())->second);
10216           ExpandR->replaceAllUsesWith(ExpandedVal);
10217           ExpandR->eraseFromParent();
10218         }
10219 
10220         // Ensure that the start values for any VPWidenIntOrFpInductionRecipe,
10221         // VPWidenPointerInductionRecipe and VPReductionPHIRecipes are updated
10222         // before vectorizing the epilogue loop.
10223         for (VPRecipeBase &R : Header->phis()) {
10224           if (isa<VPCanonicalIVPHIRecipe>(&R))
10225             continue;
10226 
10227           Value *ResumeV = nullptr;
10228           // TODO: Move setting of resume values to prepareToExecute.
10229           if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
10230             ResumeV = MainILV.getReductionResumeValue(
10231                 ReductionPhi->getRecurrenceDescriptor());
10232           } else {
10233             // Create induction resume values for both widened pointer and
10234             // integer/fp inductions and update the start value of the induction
10235             // recipes to use the resume value.
10236             PHINode *IndPhi = nullptr;
10237             const InductionDescriptor *ID;
10238             if (auto *Ind = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
10239               IndPhi = cast<PHINode>(Ind->getUnderlyingValue());
10240               ID = &Ind->getInductionDescriptor();
10241             } else {
10242               auto *WidenInd = cast<VPWidenIntOrFpInductionRecipe>(&R);
10243               IndPhi = WidenInd->getPHINode();
10244               ID = &WidenInd->getInductionDescriptor();
10245             }
10246 
10247             ResumeV = MainILV.createInductionResumeValue(
10248                 IndPhi, *ID, getExpandedStep(*ID, ExpandedSCEVs),
10249                 {EPI.MainLoopIterationCountCheck});
10250           }
10251           assert(ResumeV && "Must have a resume value");
10252           VPValue *StartVal = BestEpiPlan.getVPValueOrAddLiveIn(ResumeV);
10253           cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
10254         }
10255 
10256         LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
10257                         DT, true, &ExpandedSCEVs);
10258         ++LoopsEpilogueVectorized;
10259 
10260         if (!MainILV.areSafetyChecksAdded())
10261           DisableRuntimeUnroll = true;
10262       } else {
10263         InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
10264                                VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
10265                                PSI, Checks);
10266 
10267         VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10268         LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
10269         ++LoopsVectorized;
10270 
10271         // Add metadata to disable runtime unrolling a scalar loop when there
10272         // are no runtime checks about strides and memory. A scalar loop that is
10273         // rarely used is not worth unrolling.
10274         if (!LB.areSafetyChecksAdded())
10275           DisableRuntimeUnroll = true;
10276       }
10277       // Report the vectorization decision.
10278       reportVectorization(ORE, L, VF, IC);
10279     }
10280 
10281     if (ORE->allowExtraAnalysis(LV_NAME))
10282       checkMixedPrecision(L, ORE);
10283   }
10284 
10285   std::optional<MDNode *> RemainderLoopID =
10286       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
10287                                       LLVMLoopVectorizeFollowupEpilogue});
10288   if (RemainderLoopID) {
10289     L->setLoopID(*RemainderLoopID);
10290   } else {
10291     if (DisableRuntimeUnroll)
10292       AddRuntimeUnrollDisableMetaData(L);
10293 
10294     // Mark the loop as already vectorized to avoid vectorizing again.
10295     Hints.setAlreadyVectorized();
10296   }
10297 
10298   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10299   return true;
10300 }
10301 
10302 LoopVectorizeResult LoopVectorizePass::runImpl(
10303     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
10304     DominatorTree &DT_, BlockFrequencyInfo *BFI_, TargetLibraryInfo *TLI_,
10305     DemandedBits &DB_, AssumptionCache &AC_, LoopAccessInfoManager &LAIs_,
10306     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
10307   SE = &SE_;
10308   LI = &LI_;
10309   TTI = &TTI_;
10310   DT = &DT_;
10311   BFI = BFI_;
10312   TLI = TLI_;
10313   AC = &AC_;
10314   LAIs = &LAIs_;
10315   DB = &DB_;
10316   ORE = &ORE_;
10317   PSI = PSI_;
10318 
10319   // Don't attempt if
10320   // 1. the target claims to have no vector registers, and
10321   // 2. interleaving won't help ILP.
10322   //
10323   // The second condition is necessary because, even if the target has no
10324   // vector registers, loop vectorization may still enable scalar
10325   // interleaving.
10326   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
10327       TTI->getMaxInterleaveFactor(ElementCount::getFixed(1)) < 2)
10328     return LoopVectorizeResult(false, false);
10329 
10330   bool Changed = false, CFGChanged = false;
10331 
10332   // The vectorizer requires loops to be in simplified form.
10333   // Since simplification may add new inner loops, it has to run before the
10334   // legality and profitability checks. This means running the loop vectorizer
10335   // will simplify all loops, regardless of whether anything end up being
10336   // vectorized.
10337   for (const auto &L : *LI)
10338     Changed |= CFGChanged |=
10339         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10340 
10341   // Build up a worklist of inner-loops to vectorize. This is necessary as
10342   // the act of vectorizing or partially unrolling a loop creates new loops
10343   // and can invalidate iterators across the loops.
10344   SmallVector<Loop *, 8> Worklist;
10345 
10346   for (Loop *L : *LI)
10347     collectSupportedLoops(*L, LI, ORE, Worklist);
10348 
10349   LoopsAnalyzed += Worklist.size();
10350 
10351   // Now walk the identified inner loops.
10352   while (!Worklist.empty()) {
10353     Loop *L = Worklist.pop_back_val();
10354 
10355     // For the inner loops we actually process, form LCSSA to simplify the
10356     // transform.
10357     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
10358 
10359     Changed |= CFGChanged |= processLoop(L);
10360 
10361     if (Changed) {
10362       LAIs->clear();
10363 
10364 #ifndef NDEBUG
10365       if (VerifySCEV)
10366         SE->verify();
10367 #endif
10368     }
10369   }
10370 
10371   // Process each loop nest in the function.
10372   return LoopVectorizeResult(Changed, CFGChanged);
10373 }
10374 
10375 PreservedAnalyses LoopVectorizePass::run(Function &F,
10376                                          FunctionAnalysisManager &AM) {
10377     auto &LI = AM.getResult<LoopAnalysis>(F);
10378     // There are no loops in the function. Return before computing other expensive
10379     // analyses.
10380     if (LI.empty())
10381       return PreservedAnalyses::all();
10382     auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
10383     auto &TTI = AM.getResult<TargetIRAnalysis>(F);
10384     auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
10385     auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
10386     auto &AC = AM.getResult<AssumptionAnalysis>(F);
10387     auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
10388     auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
10389 
10390     LoopAccessInfoManager &LAIs = AM.getResult<LoopAccessAnalysis>(F);
10391     auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
10392     ProfileSummaryInfo *PSI =
10393         MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
10394     BlockFrequencyInfo *BFI = nullptr;
10395     if (PSI && PSI->hasProfileSummary())
10396       BFI = &AM.getResult<BlockFrequencyAnalysis>(F);
10397     LoopVectorizeResult Result =
10398         runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AC, LAIs, ORE, PSI);
10399     if (!Result.MadeAnyChange)
10400       return PreservedAnalyses::all();
10401     PreservedAnalyses PA;
10402 
10403     if (isAssignmentTrackingEnabled(*F.getParent())) {
10404       for (auto &BB : F)
10405         RemoveRedundantDbgInstrs(&BB);
10406     }
10407 
10408     // We currently do not preserve loopinfo/dominator analyses with outer loop
10409     // vectorization. Until this is addressed, mark these analyses as preserved
10410     // only for non-VPlan-native path.
10411     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
10412     if (!EnableVPlanNativePath) {
10413       PA.preserve<LoopAnalysis>();
10414       PA.preserve<DominatorTreeAnalysis>();
10415       PA.preserve<ScalarEvolutionAnalysis>();
10416     }
10417 
10418     if (Result.MadeCFGChange) {
10419       // Making CFG changes likely means a loop got vectorized. Indicate that
10420       // extra simplification passes should be run.
10421       // TODO: MadeCFGChanges is not a prefect proxy. Extra passes should only
10422       // be run if runtime checks have been added.
10423       AM.getResult<ShouldRunExtraVectorPasses>(F);
10424       PA.preserve<ShouldRunExtraVectorPasses>();
10425     } else {
10426       PA.preserveSet<CFGAnalyses>();
10427     }
10428     return PA;
10429 }
10430 
10431 void LoopVectorizePass::printPipeline(
10432     raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
10433   static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
10434       OS, MapClassName2PassName);
10435 
10436   OS << '<';
10437   OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
10438   OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
10439   OS << '>';
10440 }
10441