xref: /freebsd-src/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (revision 7a6dacaca14b62ca4b74406814becb87a3fefac0)
1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
17 //
18 // This pass has three parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
28 // There is a development effort going on to migrate loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanAnalysis.h"
61 #include "VPlanHCFGBuilder.h"
62 #include "VPlanTransforms.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/STLExtras.h"
70 #include "llvm/ADT/SmallPtrSet.h"
71 #include "llvm/ADT/SmallSet.h"
72 #include "llvm/ADT/SmallVector.h"
73 #include "llvm/ADT/Statistic.h"
74 #include "llvm/ADT/StringRef.h"
75 #include "llvm/ADT/Twine.h"
76 #include "llvm/ADT/iterator_range.h"
77 #include "llvm/Analysis/AssumptionCache.h"
78 #include "llvm/Analysis/BasicAliasAnalysis.h"
79 #include "llvm/Analysis/BlockFrequencyInfo.h"
80 #include "llvm/Analysis/CFG.h"
81 #include "llvm/Analysis/CodeMetrics.h"
82 #include "llvm/Analysis/DemandedBits.h"
83 #include "llvm/Analysis/GlobalsModRef.h"
84 #include "llvm/Analysis/LoopAccessAnalysis.h"
85 #include "llvm/Analysis/LoopAnalysisManager.h"
86 #include "llvm/Analysis/LoopInfo.h"
87 #include "llvm/Analysis/LoopIterator.h"
88 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
89 #include "llvm/Analysis/ProfileSummaryInfo.h"
90 #include "llvm/Analysis/ScalarEvolution.h"
91 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
92 #include "llvm/Analysis/TargetLibraryInfo.h"
93 #include "llvm/Analysis/TargetTransformInfo.h"
94 #include "llvm/Analysis/ValueTracking.h"
95 #include "llvm/Analysis/VectorUtils.h"
96 #include "llvm/IR/Attributes.h"
97 #include "llvm/IR/BasicBlock.h"
98 #include "llvm/IR/CFG.h"
99 #include "llvm/IR/Constant.h"
100 #include "llvm/IR/Constants.h"
101 #include "llvm/IR/DataLayout.h"
102 #include "llvm/IR/DebugInfo.h"
103 #include "llvm/IR/DebugInfoMetadata.h"
104 #include "llvm/IR/DebugLoc.h"
105 #include "llvm/IR/DerivedTypes.h"
106 #include "llvm/IR/DiagnosticInfo.h"
107 #include "llvm/IR/Dominators.h"
108 #include "llvm/IR/Function.h"
109 #include "llvm/IR/IRBuilder.h"
110 #include "llvm/IR/InstrTypes.h"
111 #include "llvm/IR/Instruction.h"
112 #include "llvm/IR/Instructions.h"
113 #include "llvm/IR/IntrinsicInst.h"
114 #include "llvm/IR/Intrinsics.h"
115 #include "llvm/IR/MDBuilder.h"
116 #include "llvm/IR/Metadata.h"
117 #include "llvm/IR/Module.h"
118 #include "llvm/IR/Operator.h"
119 #include "llvm/IR/PatternMatch.h"
120 #include "llvm/IR/ProfDataUtils.h"
121 #include "llvm/IR/Type.h"
122 #include "llvm/IR/Use.h"
123 #include "llvm/IR/User.h"
124 #include "llvm/IR/Value.h"
125 #include "llvm/IR/ValueHandle.h"
126 #include "llvm/IR/Verifier.h"
127 #include "llvm/Support/Casting.h"
128 #include "llvm/Support/CommandLine.h"
129 #include "llvm/Support/Compiler.h"
130 #include "llvm/Support/Debug.h"
131 #include "llvm/Support/ErrorHandling.h"
132 #include "llvm/Support/InstructionCost.h"
133 #include "llvm/Support/MathExtras.h"
134 #include "llvm/Support/raw_ostream.h"
135 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
136 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
137 #include "llvm/Transforms/Utils/LoopSimplify.h"
138 #include "llvm/Transforms/Utils/LoopUtils.h"
139 #include "llvm/Transforms/Utils/LoopVersioning.h"
140 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
141 #include "llvm/Transforms/Utils/SizeOpts.h"
142 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
143 #include <algorithm>
144 #include <cassert>
145 #include <cmath>
146 #include <cstdint>
147 #include <functional>
148 #include <iterator>
149 #include <limits>
150 #include <map>
151 #include <memory>
152 #include <string>
153 #include <tuple>
154 #include <utility>
155 
156 using namespace llvm;
157 
158 #define LV_NAME "loop-vectorize"
159 #define DEBUG_TYPE LV_NAME
160 
161 #ifndef NDEBUG
162 const char VerboseDebug[] = DEBUG_TYPE "-verbose";
163 #endif
164 
165 /// @{
166 /// Metadata attribute names
167 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
168 const char LLVMLoopVectorizeFollowupVectorized[] =
169     "llvm.loop.vectorize.followup_vectorized";
170 const char LLVMLoopVectorizeFollowupEpilogue[] =
171     "llvm.loop.vectorize.followup_epilogue";
172 /// @}
173 
174 STATISTIC(LoopsVectorized, "Number of loops vectorized");
175 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
176 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
177 
178 static cl::opt<bool> EnableEpilogueVectorization(
179     "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
180     cl::desc("Enable vectorization of epilogue loops."));
181 
182 static cl::opt<unsigned> EpilogueVectorizationForceVF(
183     "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
184     cl::desc("When epilogue vectorization is enabled, and a value greater than "
185              "1 is specified, forces the given VF for all applicable epilogue "
186              "loops."));
187 
188 static cl::opt<unsigned> EpilogueVectorizationMinVF(
189     "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
190     cl::desc("Only loops with vectorization factor equal to or larger than "
191              "the specified value are considered for epilogue vectorization."));
192 
193 /// Loops with a known constant trip count below this number are vectorized only
194 /// if no scalar iteration overheads are incurred.
195 static cl::opt<unsigned> TinyTripCountVectorThreshold(
196     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
197     cl::desc("Loops with a constant trip count that is smaller than this "
198              "value are vectorized only if no scalar iteration overheads "
199              "are incurred."));
200 
201 static cl::opt<unsigned> VectorizeMemoryCheckThreshold(
202     "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
203     cl::desc("The maximum allowed number of runtime memory checks"));
204 
205 // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
206 // that predication is preferred, and this lists all options. I.e., the
207 // vectorizer will try to fold the tail-loop (epilogue) into the vector body
208 // and predicate the instructions accordingly. If tail-folding fails, there are
209 // different fallback strategies depending on these values:
210 namespace PreferPredicateTy {
211   enum Option {
212     ScalarEpilogue = 0,
213     PredicateElseScalarEpilogue,
214     PredicateOrDontVectorize
215   };
216 } // namespace PreferPredicateTy
217 
218 static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
219     "prefer-predicate-over-epilogue",
220     cl::init(PreferPredicateTy::ScalarEpilogue),
221     cl::Hidden,
222     cl::desc("Tail-folding and predication preferences over creating a scalar "
223              "epilogue loop."),
224     cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
225                          "scalar-epilogue",
226                          "Don't tail-predicate loops, create scalar epilogue"),
227               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
228                          "predicate-else-scalar-epilogue",
229                          "prefer tail-folding, create scalar epilogue if tail "
230                          "folding fails."),
231               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
232                          "predicate-dont-vectorize",
233                          "prefers tail-folding, don't attempt vectorization if "
234                          "tail-folding fails.")));
235 
236 static cl::opt<TailFoldingStyle> ForceTailFoldingStyle(
237     "force-tail-folding-style", cl::desc("Force the tail folding style"),
238     cl::init(TailFoldingStyle::None),
239     cl::values(
240         clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"),
241         clEnumValN(
242             TailFoldingStyle::Data, "data",
243             "Create lane mask for data only, using active.lane.mask intrinsic"),
244         clEnumValN(TailFoldingStyle::DataWithoutLaneMask,
245                    "data-without-lane-mask",
246                    "Create lane mask with compare/stepvector"),
247         clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control",
248                    "Create lane mask using active.lane.mask intrinsic, and use "
249                    "it for both data and control flow"),
250         clEnumValN(
251             TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
252             "data-and-control-without-rt-check",
253             "Similar to data-and-control, but remove the runtime check")));
254 
255 static cl::opt<bool> MaximizeBandwidth(
256     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
257     cl::desc("Maximize bandwidth when selecting vectorization factor which "
258              "will be determined by the smallest type in loop."));
259 
260 static cl::opt<bool> EnableInterleavedMemAccesses(
261     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
262     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
263 
264 /// An interleave-group may need masking if it resides in a block that needs
265 /// predication, or in order to mask away gaps.
266 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
267     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
268     cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
269 
270 static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
271     "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
272     cl::desc("We don't interleave loops with a estimated constant trip count "
273              "below this number"));
274 
275 static cl::opt<unsigned> ForceTargetNumScalarRegs(
276     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
277     cl::desc("A flag that overrides the target's number of scalar registers."));
278 
279 static cl::opt<unsigned> ForceTargetNumVectorRegs(
280     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
281     cl::desc("A flag that overrides the target's number of vector registers."));
282 
283 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
284     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
285     cl::desc("A flag that overrides the target's max interleave factor for "
286              "scalar loops."));
287 
288 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
289     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
290     cl::desc("A flag that overrides the target's max interleave factor for "
291              "vectorized loops."));
292 
293 static cl::opt<unsigned> ForceTargetInstructionCost(
294     "force-target-instruction-cost", cl::init(0), cl::Hidden,
295     cl::desc("A flag that overrides the target's expected cost for "
296              "an instruction to a single constant value. Mostly "
297              "useful for getting consistent testing."));
298 
299 static cl::opt<bool> ForceTargetSupportsScalableVectors(
300     "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
301     cl::desc(
302         "Pretend that scalable vectors are supported, even if the target does "
303         "not support them. This flag should only be used for testing."));
304 
305 static cl::opt<unsigned> SmallLoopCost(
306     "small-loop-cost", cl::init(20), cl::Hidden,
307     cl::desc(
308         "The cost of a loop that is considered 'small' by the interleaver."));
309 
310 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
311     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
312     cl::desc("Enable the use of the block frequency analysis to access PGO "
313              "heuristics minimizing code growth in cold regions and being more "
314              "aggressive in hot regions."));
315 
316 // Runtime interleave loops for load/store throughput.
317 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
318     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
319     cl::desc(
320         "Enable runtime interleaving until load/store ports are saturated"));
321 
322 /// Interleave small loops with scalar reductions.
323 static cl::opt<bool> InterleaveSmallLoopScalarReduction(
324     "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
325     cl::desc("Enable interleaving for loops with small iteration counts that "
326              "contain scalar reductions to expose ILP."));
327 
328 /// The number of stores in a loop that are allowed to need predication.
329 static cl::opt<unsigned> NumberOfStoresToPredicate(
330     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
331     cl::desc("Max number of stores to be predicated behind an if."));
332 
333 static cl::opt<bool> EnableIndVarRegisterHeur(
334     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
335     cl::desc("Count the induction variable only once when interleaving"));
336 
337 static cl::opt<bool> EnableCondStoresVectorization(
338     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
339     cl::desc("Enable if predication of stores during vectorization."));
340 
341 static cl::opt<unsigned> MaxNestedScalarReductionIC(
342     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
343     cl::desc("The maximum interleave count to use when interleaving a scalar "
344              "reduction in a nested loop."));
345 
346 static cl::opt<bool>
347     PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
348                            cl::Hidden,
349                            cl::desc("Prefer in-loop vector reductions, "
350                                     "overriding the targets preference."));
351 
352 static cl::opt<bool> ForceOrderedReductions(
353     "force-ordered-reductions", cl::init(false), cl::Hidden,
354     cl::desc("Enable the vectorisation of loops with in-order (strict) "
355              "FP reductions"));
356 
357 static cl::opt<bool> PreferPredicatedReductionSelect(
358     "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
359     cl::desc(
360         "Prefer predicating a reduction operation over an after loop select."));
361 
362 namespace llvm {
363 cl::opt<bool> EnableVPlanNativePath(
364     "enable-vplan-native-path", cl::Hidden,
365     cl::desc("Enable VPlan-native vectorization path with "
366              "support for outer loop vectorization."));
367 }
368 
369 // This flag enables the stress testing of the VPlan H-CFG construction in the
370 // VPlan-native vectorization path. It must be used in conjuction with
371 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
372 // verification of the H-CFGs built.
373 static cl::opt<bool> VPlanBuildStressTest(
374     "vplan-build-stress-test", cl::init(false), cl::Hidden,
375     cl::desc(
376         "Build VPlan for every supported loop nest in the function and bail "
377         "out right after the build (stress test the VPlan H-CFG construction "
378         "in the VPlan-native vectorization path)."));
379 
380 cl::opt<bool> llvm::EnableLoopInterleaving(
381     "interleave-loops", cl::init(true), cl::Hidden,
382     cl::desc("Enable loop interleaving in Loop vectorization passes"));
383 cl::opt<bool> llvm::EnableLoopVectorization(
384     "vectorize-loops", cl::init(true), cl::Hidden,
385     cl::desc("Run the Loop vectorization passes"));
386 
387 static cl::opt<bool> PrintVPlansInDotFormat(
388     "vplan-print-in-dot-format", cl::Hidden,
389     cl::desc("Use dot format instead of plain text when dumping VPlans"));
390 
391 static cl::opt<cl::boolOrDefault> ForceSafeDivisor(
392     "force-widen-divrem-via-safe-divisor", cl::Hidden,
393     cl::desc(
394         "Override cost based safe divisor widening for div/rem instructions"));
395 
396 static cl::opt<bool> UseWiderVFIfCallVariantsPresent(
397     "vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true),
398     cl::Hidden,
399     cl::desc("Try wider VFs if they enable the use of vector variants"));
400 
401 // Likelyhood of bypassing the vectorized loop because assumptions about SCEV
402 // variables not overflowing do not hold. See `emitSCEVChecks`.
403 static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127};
404 // Likelyhood of bypassing the vectorized loop because pointers overlap. See
405 // `emitMemRuntimeChecks`.
406 static constexpr uint32_t MemCheckBypassWeights[] = {1, 127};
407 // Likelyhood of bypassing the vectorized loop because there are zero trips left
408 // after prolog. See `emitIterationCountCheck`.
409 static constexpr uint32_t MinItersBypassWeights[] = {1, 127};
410 
411 /// A helper function that returns true if the given type is irregular. The
412 /// type is irregular if its allocated size doesn't equal the store size of an
413 /// element of the corresponding vector type.
414 static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
415   // Determine if an array of N elements of type Ty is "bitcast compatible"
416   // with a <N x Ty> vector.
417   // This is only true if there is no padding between the array elements.
418   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
419 }
420 
421 /// A helper function that returns the reciprocal of the block probability of
422 /// predicated blocks. If we return X, we are assuming the predicated block
423 /// will execute once for every X iterations of the loop header.
424 ///
425 /// TODO: We should use actual block probability here, if available. Currently,
426 ///       we always assume predicated blocks have a 50% chance of executing.
427 static unsigned getReciprocalPredBlockProb() { return 2; }
428 
429 /// Returns "best known" trip count for the specified loop \p L as defined by
430 /// the following procedure:
431 ///   1) Returns exact trip count if it is known.
432 ///   2) Returns expected trip count according to profile data if any.
433 ///   3) Returns upper bound estimate if it is known.
434 ///   4) Returns std::nullopt if all of the above failed.
435 static std::optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE,
436                                                    Loop *L) {
437   // Check if exact trip count is known.
438   if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
439     return ExpectedTC;
440 
441   // Check if there is an expected trip count available from profile data.
442   if (LoopVectorizeWithBlockFrequency)
443     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
444       return *EstimatedTC;
445 
446   // Check if upper bound estimate is known.
447   if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
448     return ExpectedTC;
449 
450   return std::nullopt;
451 }
452 
453 /// Return a vector containing interleaved elements from multiple
454 /// smaller input vectors.
455 static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
456                                 const Twine &Name) {
457   unsigned Factor = Vals.size();
458   assert(Factor > 1 && "Tried to interleave invalid number of vectors");
459 
460   VectorType *VecTy = cast<VectorType>(Vals[0]->getType());
461 #ifndef NDEBUG
462   for (Value *Val : Vals)
463     assert(Val->getType() == VecTy && "Tried to interleave mismatched types");
464 #endif
465 
466   // Scalable vectors cannot use arbitrary shufflevectors (only splats), so
467   // must use intrinsics to interleave.
468   if (VecTy->isScalableTy()) {
469     VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy);
470     return Builder.CreateIntrinsic(
471         WideVecTy, Intrinsic::experimental_vector_interleave2, Vals,
472         /*FMFSource=*/nullptr, Name);
473   }
474 
475   // Fixed length. Start by concatenating all vectors into a wide vector.
476   Value *WideVec = concatenateVectors(Builder, Vals);
477 
478   // Interleave the elements into the wide vector.
479   const unsigned NumElts = VecTy->getElementCount().getFixedValue();
480   return Builder.CreateShuffleVector(
481       WideVec, createInterleaveMask(NumElts, Factor), Name);
482 }
483 
484 namespace {
485 // Forward declare GeneratedRTChecks.
486 class GeneratedRTChecks;
487 
488 using SCEV2ValueTy = DenseMap<const SCEV *, Value *>;
489 } // namespace
490 
491 namespace llvm {
492 
493 AnalysisKey ShouldRunExtraVectorPasses::Key;
494 
495 /// InnerLoopVectorizer vectorizes loops which contain only one basic
496 /// block to a specified vectorization factor (VF).
497 /// This class performs the widening of scalars into vectors, or multiple
498 /// scalars. This class also implements the following features:
499 /// * It inserts an epilogue loop for handling loops that don't have iteration
500 ///   counts that are known to be a multiple of the vectorization factor.
501 /// * It handles the code generation for reduction variables.
502 /// * Scalarization (implementation using scalars) of un-vectorizable
503 ///   instructions.
504 /// InnerLoopVectorizer does not perform any vectorization-legality
505 /// checks, and relies on the caller to check for the different legality
506 /// aspects. The InnerLoopVectorizer relies on the
507 /// LoopVectorizationLegality class to provide information about the induction
508 /// and reduction variables that were found to a given vectorization factor.
509 class InnerLoopVectorizer {
510 public:
511   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
512                       LoopInfo *LI, DominatorTree *DT,
513                       const TargetLibraryInfo *TLI,
514                       const TargetTransformInfo *TTI, AssumptionCache *AC,
515                       OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
516                       ElementCount MinProfitableTripCount,
517                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
518                       LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
519                       ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
520       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
521         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
522         Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
523         PSI(PSI), RTChecks(RTChecks) {
524     // Query this against the original loop and save it here because the profile
525     // of the original loop header may change as the transformation happens.
526     OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
527         OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
528 
529     if (MinProfitableTripCount.isZero())
530       this->MinProfitableTripCount = VecWidth;
531     else
532       this->MinProfitableTripCount = MinProfitableTripCount;
533   }
534 
535   virtual ~InnerLoopVectorizer() = default;
536 
537   /// Create a new empty loop that will contain vectorized instructions later
538   /// on, while the old loop will be used as the scalar remainder. Control flow
539   /// is generated around the vectorized (and scalar epilogue) loops consisting
540   /// of various checks and bypasses. Return the pre-header block of the new
541   /// loop and the start value for the canonical induction, if it is != 0. The
542   /// latter is the case when vectorizing the epilogue loop. In the case of
543   /// epilogue vectorization, this function is overriden to handle the more
544   /// complex control flow around the loops.  \p ExpandedSCEVs is used to
545   /// look up SCEV expansions for expressions needed during skeleton creation.
546   virtual std::pair<BasicBlock *, Value *>
547   createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs);
548 
549   /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
550   void fixVectorizedLoop(VPTransformState &State, VPlan &Plan);
551 
552   // Return true if any runtime check is added.
553   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
554 
555   /// A type for vectorized values in the new loop. Each value from the
556   /// original loop, when vectorized, is represented by UF vector values in the
557   /// new unrolled loop, where UF is the unroll factor.
558   using VectorParts = SmallVector<Value *, 2>;
559 
560   /// A helper function to scalarize a single Instruction in the innermost loop.
561   /// Generates a sequence of scalar instances for each lane between \p MinLane
562   /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
563   /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p
564   /// Instr's operands.
565   void scalarizeInstruction(const Instruction *Instr,
566                             VPReplicateRecipe *RepRecipe,
567                             const VPIteration &Instance,
568                             VPTransformState &State);
569 
570   /// Try to vectorize interleaved access group \p Group with the base address
571   /// given in \p Addr, optionally masking the vector operations if \p
572   /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
573   /// values in the vectorized loop.
574   void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
575                                 ArrayRef<VPValue *> VPDefs,
576                                 VPTransformState &State, VPValue *Addr,
577                                 ArrayRef<VPValue *> StoredValues,
578                                 VPValue *BlockInMask, bool NeedsMaskForGaps);
579 
580   /// Fix the non-induction PHIs in \p Plan.
581   void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State);
582 
583   /// Returns true if the reordering of FP operations is not allowed, but we are
584   /// able to vectorize with strict in-order reductions for the given RdxDesc.
585   bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc);
586 
587   /// Create a new phi node for the induction variable \p OrigPhi to resume
588   /// iteration count in the scalar epilogue, from where the vectorized loop
589   /// left off. \p Step is the SCEV-expanded induction step to use. In cases
590   /// where the loop skeleton is more complicated (i.e., epilogue vectorization)
591   /// and the resume values can come from an additional bypass block, the \p
592   /// AdditionalBypass pair provides information about the bypass block and the
593   /// end value on the edge from bypass to this loop.
594   PHINode *createInductionResumeValue(
595       PHINode *OrigPhi, const InductionDescriptor &ID, Value *Step,
596       ArrayRef<BasicBlock *> BypassBlocks,
597       std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
598 
599   /// Returns the original loop trip count.
600   Value *getTripCount() const { return TripCount; }
601 
602   /// Used to set the trip count after ILV's construction and after the
603   /// preheader block has been executed. Note that this always holds the trip
604   /// count of the original loop for both main loop and epilogue vectorization.
605   void setTripCount(Value *TC) { TripCount = TC; }
606 
607 protected:
608   friend class LoopVectorizationPlanner;
609 
610   /// A small list of PHINodes.
611   using PhiVector = SmallVector<PHINode *, 4>;
612 
613   /// A type for scalarized values in the new loop. Each value from the
614   /// original loop, when scalarized, is represented by UF x VF scalar values
615   /// in the new unrolled loop, where UF is the unroll factor and VF is the
616   /// vectorization factor.
617   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
618 
619   /// Set up the values of the IVs correctly when exiting the vector loop.
620   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
621                     Value *VectorTripCount, Value *EndValue,
622                     BasicBlock *MiddleBlock, BasicBlock *VectorHeader,
623                     VPlan &Plan, VPTransformState &State);
624 
625   /// Create the exit value of first order recurrences in the middle block and
626   /// update their users.
627   void fixFixedOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR,
628                                VPTransformState &State);
629 
630   /// Create code for the loop exit value of the reduction.
631   void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);
632 
633   /// Iteratively sink the scalarized operands of a predicated instruction into
634   /// the block that was created for it.
635   void sinkScalarOperands(Instruction *PredInst);
636 
637   /// Returns (and creates if needed) the trip count of the widened loop.
638   Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);
639 
640   /// Returns a bitcasted value to the requested vector type.
641   /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
642   Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
643                                 const DataLayout &DL);
644 
645   /// Emit a bypass check to see if the vector trip count is zero, including if
646   /// it overflows.
647   void emitIterationCountCheck(BasicBlock *Bypass);
648 
649   /// Emit a bypass check to see if all of the SCEV assumptions we've
650   /// had to make are correct. Returns the block containing the checks or
651   /// nullptr if no checks have been added.
652   BasicBlock *emitSCEVChecks(BasicBlock *Bypass);
653 
654   /// Emit bypass checks to check any memory assumptions we may have made.
655   /// Returns the block containing the checks or nullptr if no checks have been
656   /// added.
657   BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass);
658 
659   /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
660   /// vector loop preheader, middle block and scalar preheader.
661   void createVectorLoopSkeleton(StringRef Prefix);
662 
663   /// Create new phi nodes for the induction variables to resume iteration count
664   /// in the scalar epilogue, from where the vectorized loop left off.
665   /// In cases where the loop skeleton is more complicated (eg. epilogue
666   /// vectorization) and the resume values can come from an additional bypass
667   /// block, the \p AdditionalBypass pair provides information about the bypass
668   /// block and the end value on the edge from bypass to this loop.
669   void createInductionResumeValues(
670       const SCEV2ValueTy &ExpandedSCEVs,
671       std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
672 
673   /// Complete the loop skeleton by adding debug MDs, creating appropriate
674   /// conditional branches in the middle block, preparing the builder and
675   /// running the verifier. Return the preheader of the completed vector loop.
676   BasicBlock *completeLoopSkeleton();
677 
678   /// Collect poison-generating recipes that may generate a poison value that is
679   /// used after vectorization, even when their operands are not poison. Those
680   /// recipes meet the following conditions:
681   ///  * Contribute to the address computation of a recipe generating a widen
682   ///    memory load/store (VPWidenMemoryInstructionRecipe or
683   ///    VPInterleaveRecipe).
684   ///  * Such a widen memory load/store has at least one underlying Instruction
685   ///    that is in a basic block that needs predication and after vectorization
686   ///    the generated instruction won't be predicated.
687   void collectPoisonGeneratingRecipes(VPTransformState &State);
688 
689   /// Allow subclasses to override and print debug traces before/after vplan
690   /// execution, when trace information is requested.
691   virtual void printDebugTracesAtStart(){};
692   virtual void printDebugTracesAtEnd(){};
693 
694   /// The original loop.
695   Loop *OrigLoop;
696 
697   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
698   /// dynamic knowledge to simplify SCEV expressions and converts them to a
699   /// more usable form.
700   PredicatedScalarEvolution &PSE;
701 
702   /// Loop Info.
703   LoopInfo *LI;
704 
705   /// Dominator Tree.
706   DominatorTree *DT;
707 
708   /// Target Library Info.
709   const TargetLibraryInfo *TLI;
710 
711   /// Target Transform Info.
712   const TargetTransformInfo *TTI;
713 
714   /// Assumption Cache.
715   AssumptionCache *AC;
716 
717   /// Interface to emit optimization remarks.
718   OptimizationRemarkEmitter *ORE;
719 
720   /// The vectorization SIMD factor to use. Each vector will have this many
721   /// vector elements.
722   ElementCount VF;
723 
724   ElementCount MinProfitableTripCount;
725 
726   /// The vectorization unroll factor to use. Each scalar is vectorized to this
727   /// many different vector instructions.
728   unsigned UF;
729 
730   /// The builder that we use
731   IRBuilder<> Builder;
732 
733   // --- Vectorization state ---
734 
735   /// The vector-loop preheader.
736   BasicBlock *LoopVectorPreHeader;
737 
738   /// The scalar-loop preheader.
739   BasicBlock *LoopScalarPreHeader;
740 
741   /// Middle Block between the vector and the scalar.
742   BasicBlock *LoopMiddleBlock;
743 
744   /// The unique ExitBlock of the scalar loop if one exists.  Note that
745   /// there can be multiple exiting edges reaching this block.
746   BasicBlock *LoopExitBlock;
747 
748   /// The scalar loop body.
749   BasicBlock *LoopScalarBody;
750 
751   /// A list of all bypass blocks. The first block is the entry of the loop.
752   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
753 
754   /// Store instructions that were predicated.
755   SmallVector<Instruction *, 4> PredicatedInstructions;
756 
757   /// Trip count of the original loop.
758   Value *TripCount = nullptr;
759 
760   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
761   Value *VectorTripCount = nullptr;
762 
763   /// The legality analysis.
764   LoopVectorizationLegality *Legal;
765 
766   /// The profitablity analysis.
767   LoopVectorizationCostModel *Cost;
768 
769   // Record whether runtime checks are added.
770   bool AddedSafetyChecks = false;
771 
772   // Holds the end values for each induction variable. We save the end values
773   // so we can later fix-up the external users of the induction variables.
774   DenseMap<PHINode *, Value *> IVEndValues;
775 
776   /// BFI and PSI are used to check for profile guided size optimizations.
777   BlockFrequencyInfo *BFI;
778   ProfileSummaryInfo *PSI;
779 
780   // Whether this loop should be optimized for size based on profile guided size
781   // optimizatios.
782   bool OptForSizeBasedOnProfile;
783 
784   /// Structure to hold information about generated runtime checks, responsible
785   /// for cleaning the checks, if vectorization turns out unprofitable.
786   GeneratedRTChecks &RTChecks;
787 
788   // Holds the resume values for reductions in the loops, used to set the
789   // correct start value of reduction PHIs when vectorizing the epilogue.
790   SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4>
791       ReductionResumeValues;
792 };
793 
794 class InnerLoopUnroller : public InnerLoopVectorizer {
795 public:
796   InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
797                     LoopInfo *LI, DominatorTree *DT,
798                     const TargetLibraryInfo *TLI,
799                     const TargetTransformInfo *TTI, AssumptionCache *AC,
800                     OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
801                     LoopVectorizationLegality *LVL,
802                     LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
803                     ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
804       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
805                             ElementCount::getFixed(1),
806                             ElementCount::getFixed(1), UnrollFactor, LVL, CM,
807                             BFI, PSI, Check) {}
808 };
809 
810 /// Encapsulate information regarding vectorization of a loop and its epilogue.
811 /// This information is meant to be updated and used across two stages of
812 /// epilogue vectorization.
813 struct EpilogueLoopVectorizationInfo {
814   ElementCount MainLoopVF = ElementCount::getFixed(0);
815   unsigned MainLoopUF = 0;
816   ElementCount EpilogueVF = ElementCount::getFixed(0);
817   unsigned EpilogueUF = 0;
818   BasicBlock *MainLoopIterationCountCheck = nullptr;
819   BasicBlock *EpilogueIterationCountCheck = nullptr;
820   BasicBlock *SCEVSafetyCheck = nullptr;
821   BasicBlock *MemSafetyCheck = nullptr;
822   Value *TripCount = nullptr;
823   Value *VectorTripCount = nullptr;
824 
825   EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
826                                 ElementCount EVF, unsigned EUF)
827       : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
828     assert(EUF == 1 &&
829            "A high UF for the epilogue loop is likely not beneficial.");
830   }
831 };
832 
833 /// An extension of the inner loop vectorizer that creates a skeleton for a
834 /// vectorized loop that has its epilogue (residual) also vectorized.
835 /// The idea is to run the vplan on a given loop twice, firstly to setup the
836 /// skeleton and vectorize the main loop, and secondly to complete the skeleton
837 /// from the first step and vectorize the epilogue.  This is achieved by
838 /// deriving two concrete strategy classes from this base class and invoking
839 /// them in succession from the loop vectorizer planner.
840 class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
841 public:
842   InnerLoopAndEpilogueVectorizer(
843       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
844       DominatorTree *DT, const TargetLibraryInfo *TLI,
845       const TargetTransformInfo *TTI, AssumptionCache *AC,
846       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
847       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
848       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
849       GeneratedRTChecks &Checks)
850       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
851                             EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
852                             CM, BFI, PSI, Checks),
853         EPI(EPI) {}
854 
855   // Override this function to handle the more complex control flow around the
856   // three loops.
857   std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton(
858       const SCEV2ValueTy &ExpandedSCEVs) final {
859     return createEpilogueVectorizedLoopSkeleton(ExpandedSCEVs);
860   }
861 
862   /// The interface for creating a vectorized skeleton using one of two
863   /// different strategies, each corresponding to one execution of the vplan
864   /// as described above.
865   virtual std::pair<BasicBlock *, Value *>
866   createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) = 0;
867 
868   /// Holds and updates state information required to vectorize the main loop
869   /// and its epilogue in two separate passes. This setup helps us avoid
870   /// regenerating and recomputing runtime safety checks. It also helps us to
871   /// shorten the iteration-count-check path length for the cases where the
872   /// iteration count of the loop is so small that the main vector loop is
873   /// completely skipped.
874   EpilogueLoopVectorizationInfo &EPI;
875 };
876 
877 /// A specialized derived class of inner loop vectorizer that performs
878 /// vectorization of *main* loops in the process of vectorizing loops and their
879 /// epilogues.
880 class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
881 public:
882   EpilogueVectorizerMainLoop(
883       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
884       DominatorTree *DT, const TargetLibraryInfo *TLI,
885       const TargetTransformInfo *TTI, AssumptionCache *AC,
886       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
887       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
888       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
889       GeneratedRTChecks &Check)
890       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
891                                        EPI, LVL, CM, BFI, PSI, Check) {}
892   /// Implements the interface for creating a vectorized skeleton using the
893   /// *main loop* strategy (ie the first pass of vplan execution).
894   std::pair<BasicBlock *, Value *>
895   createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
896 
897 protected:
898   /// Emits an iteration count bypass check once for the main loop (when \p
899   /// ForEpilogue is false) and once for the epilogue loop (when \p
900   /// ForEpilogue is true).
901   BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
902   void printDebugTracesAtStart() override;
903   void printDebugTracesAtEnd() override;
904 };
905 
906 // A specialized derived class of inner loop vectorizer that performs
907 // vectorization of *epilogue* loops in the process of vectorizing loops and
908 // their epilogues.
909 class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
910 public:
911   EpilogueVectorizerEpilogueLoop(
912       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
913       DominatorTree *DT, const TargetLibraryInfo *TLI,
914       const TargetTransformInfo *TTI, AssumptionCache *AC,
915       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
916       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
917       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
918       GeneratedRTChecks &Checks)
919       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
920                                        EPI, LVL, CM, BFI, PSI, Checks) {
921     TripCount = EPI.TripCount;
922   }
923   /// Implements the interface for creating a vectorized skeleton using the
924   /// *epilogue loop* strategy (ie the second pass of vplan execution).
925   std::pair<BasicBlock *, Value *>
926   createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
927 
928 protected:
929   /// Emits an iteration count bypass check after the main vector loop has
930   /// finished to see if there are any iterations left to execute by either
931   /// the vector epilogue or the scalar epilogue.
932   BasicBlock *emitMinimumVectorEpilogueIterCountCheck(
933                                                       BasicBlock *Bypass,
934                                                       BasicBlock *Insert);
935   void printDebugTracesAtStart() override;
936   void printDebugTracesAtEnd() override;
937 };
938 } // end namespace llvm
939 
940 /// Look for a meaningful debug location on the instruction or it's
941 /// operands.
942 static DebugLoc getDebugLocFromInstOrOperands(Instruction *I) {
943   if (!I)
944     return DebugLoc();
945 
946   DebugLoc Empty;
947   if (I->getDebugLoc() != Empty)
948     return I->getDebugLoc();
949 
950   for (Use &Op : I->operands()) {
951     if (Instruction *OpInst = dyn_cast<Instruction>(Op))
952       if (OpInst->getDebugLoc() != Empty)
953         return OpInst->getDebugLoc();
954   }
955 
956   return I->getDebugLoc();
957 }
958 
959 /// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
960 /// is passed, the message relates to that particular instruction.
961 #ifndef NDEBUG
962 static void debugVectorizationMessage(const StringRef Prefix,
963                                       const StringRef DebugMsg,
964                                       Instruction *I) {
965   dbgs() << "LV: " << Prefix << DebugMsg;
966   if (I != nullptr)
967     dbgs() << " " << *I;
968   else
969     dbgs() << '.';
970   dbgs() << '\n';
971 }
972 #endif
973 
974 /// Create an analysis remark that explains why vectorization failed
975 ///
976 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
977 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
978 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
979 /// the location of the remark.  \return the remark object that can be
980 /// streamed to.
981 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
982     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
983   Value *CodeRegion = TheLoop->getHeader();
984   DebugLoc DL = TheLoop->getStartLoc();
985 
986   if (I) {
987     CodeRegion = I->getParent();
988     // If there is no debug location attached to the instruction, revert back to
989     // using the loop's.
990     if (I->getDebugLoc())
991       DL = I->getDebugLoc();
992   }
993 
994   return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
995 }
996 
997 namespace llvm {
998 
999 /// Return a value for Step multiplied by VF.
1000 Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
1001                        int64_t Step) {
1002   assert(Ty->isIntegerTy() && "Expected an integer step");
1003   return B.CreateElementCount(Ty, VF.multiplyCoefficientBy(Step));
1004 }
1005 
1006 /// Return the runtime value for VF.
1007 Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
1008   return B.CreateElementCount(Ty, VF);
1009 }
1010 
1011 const SCEV *createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE,
1012                                 Loop *OrigLoop) {
1013   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
1014   assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && "Invalid loop count");
1015 
1016   ScalarEvolution &SE = *PSE.getSE();
1017   return SE.getTripCountFromExitCount(BackedgeTakenCount, IdxTy, OrigLoop);
1018 }
1019 
1020 void reportVectorizationFailure(const StringRef DebugMsg,
1021                                 const StringRef OREMsg, const StringRef ORETag,
1022                                 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1023                                 Instruction *I) {
1024   LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
1025   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1026   ORE->emit(
1027       createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1028       << "loop not vectorized: " << OREMsg);
1029 }
1030 
1031 void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
1032                              OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1033                              Instruction *I) {
1034   LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
1035   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1036   ORE->emit(
1037       createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1038       << Msg);
1039 }
1040 
1041 /// Report successful vectorization of the loop. In case an outer loop is
1042 /// vectorized, prepend "outer" to the vectorization remark.
1043 static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1044                                 VectorizationFactor VF, unsigned IC) {
1045   LLVM_DEBUG(debugVectorizationMessage(
1046       "Vectorizing: ", TheLoop->isInnermost() ? "innermost loop" : "outer loop",
1047       nullptr));
1048   StringRef LoopType = TheLoop->isInnermost() ? "" : "outer ";
1049   ORE->emit([&]() {
1050     return OptimizationRemark(LV_NAME, "Vectorized", TheLoop->getStartLoc(),
1051                               TheLoop->getHeader())
1052            << "vectorized " << LoopType << "loop (vectorization width: "
1053            << ore::NV("VectorizationFactor", VF.Width)
1054            << ", interleaved count: " << ore::NV("InterleaveCount", IC) << ")";
1055   });
1056 }
1057 
1058 } // end namespace llvm
1059 
1060 #ifndef NDEBUG
1061 /// \return string containing a file name and a line # for the given loop.
1062 static std::string getDebugLocString(const Loop *L) {
1063   std::string Result;
1064   if (L) {
1065     raw_string_ostream OS(Result);
1066     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
1067       LoopDbgLoc.print(OS);
1068     else
1069       // Just print the module name.
1070       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
1071     OS.flush();
1072   }
1073   return Result;
1074 }
1075 #endif
1076 
1077 void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
1078     VPTransformState &State) {
1079 
1080   // Collect recipes in the backward slice of `Root` that may generate a poison
1081   // value that is used after vectorization.
1082   SmallPtrSet<VPRecipeBase *, 16> Visited;
1083   auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
1084     SmallVector<VPRecipeBase *, 16> Worklist;
1085     Worklist.push_back(Root);
1086 
1087     // Traverse the backward slice of Root through its use-def chain.
1088     while (!Worklist.empty()) {
1089       VPRecipeBase *CurRec = Worklist.back();
1090       Worklist.pop_back();
1091 
1092       if (!Visited.insert(CurRec).second)
1093         continue;
1094 
1095       // Prune search if we find another recipe generating a widen memory
1096       // instruction. Widen memory instructions involved in address computation
1097       // will lead to gather/scatter instructions, which don't need to be
1098       // handled.
1099       if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
1100           isa<VPInterleaveRecipe>(CurRec) ||
1101           isa<VPScalarIVStepsRecipe>(CurRec) ||
1102           isa<VPCanonicalIVPHIRecipe>(CurRec) ||
1103           isa<VPActiveLaneMaskPHIRecipe>(CurRec))
1104         continue;
1105 
1106       // This recipe contributes to the address computation of a widen
1107       // load/store. If the underlying instruction has poison-generating flags,
1108       // drop them directly.
1109       if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {
1110         RecWithFlags->dropPoisonGeneratingFlags();
1111       } else {
1112         Instruction *Instr = dyn_cast_or_null<Instruction>(
1113             CurRec->getVPSingleValue()->getUnderlyingValue());
1114         (void)Instr;
1115         assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
1116                "found instruction with poison generating flags not covered by "
1117                "VPRecipeWithIRFlags");
1118       }
1119 
1120       // Add new definitions to the worklist.
1121       for (VPValue *operand : CurRec->operands())
1122         if (VPRecipeBase *OpDef = operand->getDefiningRecipe())
1123           Worklist.push_back(OpDef);
1124     }
1125   });
1126 
1127   // Traverse all the recipes in the VPlan and collect the poison-generating
1128   // recipes in the backward slice starting at the address of a VPWidenRecipe or
1129   // VPInterleaveRecipe.
1130   auto Iter = vp_depth_first_deep(State.Plan->getEntry());
1131   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
1132     for (VPRecipeBase &Recipe : *VPBB) {
1133       if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
1134         Instruction &UnderlyingInstr = WidenRec->getIngredient();
1135         VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
1136         if (AddrDef && WidenRec->isConsecutive() &&
1137             Legal->blockNeedsPredication(UnderlyingInstr.getParent()))
1138           collectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
1139       } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
1140         VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
1141         if (AddrDef) {
1142           // Check if any member of the interleave group needs predication.
1143           const InterleaveGroup<Instruction> *InterGroup =
1144               InterleaveRec->getInterleaveGroup();
1145           bool NeedPredication = false;
1146           for (int I = 0, NumMembers = InterGroup->getNumMembers();
1147                I < NumMembers; ++I) {
1148             Instruction *Member = InterGroup->getMember(I);
1149             if (Member)
1150               NeedPredication |=
1151                   Legal->blockNeedsPredication(Member->getParent());
1152           }
1153 
1154           if (NeedPredication)
1155             collectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
1156         }
1157       }
1158     }
1159   }
1160 }
1161 
1162 namespace llvm {
1163 
1164 // Loop vectorization cost-model hints how the scalar epilogue loop should be
1165 // lowered.
1166 enum ScalarEpilogueLowering {
1167 
1168   // The default: allowing scalar epilogues.
1169   CM_ScalarEpilogueAllowed,
1170 
1171   // Vectorization with OptForSize: don't allow epilogues.
1172   CM_ScalarEpilogueNotAllowedOptSize,
1173 
1174   // A special case of vectorisation with OptForSize: loops with a very small
1175   // trip count are considered for vectorization under OptForSize, thereby
1176   // making sure the cost of their loop body is dominant, free of runtime
1177   // guards and scalar iteration overheads.
1178   CM_ScalarEpilogueNotAllowedLowTripLoop,
1179 
1180   // Loop hint predicate indicating an epilogue is undesired.
1181   CM_ScalarEpilogueNotNeededUsePredicate,
1182 
1183   // Directive indicating we must either tail fold or not vectorize
1184   CM_ScalarEpilogueNotAllowedUsePredicate
1185 };
1186 
1187 using InstructionVFPair = std::pair<Instruction *, ElementCount>;
1188 
1189 /// LoopVectorizationCostModel - estimates the expected speedups due to
1190 /// vectorization.
1191 /// In many cases vectorization is not profitable. This can happen because of
1192 /// a number of reasons. In this class we mainly attempt to predict the
1193 /// expected speedup/slowdowns due to the supported instruction set. We use the
1194 /// TargetTransformInfo to query the different backends for the cost of
1195 /// different operations.
1196 class LoopVectorizationCostModel {
1197 public:
1198   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
1199                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
1200                              LoopVectorizationLegality *Legal,
1201                              const TargetTransformInfo &TTI,
1202                              const TargetLibraryInfo *TLI, DemandedBits *DB,
1203                              AssumptionCache *AC,
1204                              OptimizationRemarkEmitter *ORE, const Function *F,
1205                              const LoopVectorizeHints *Hints,
1206                              InterleavedAccessInfo &IAI)
1207       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1208         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1209         Hints(Hints), InterleaveInfo(IAI) {}
1210 
1211   /// \return An upper bound for the vectorization factors (both fixed and
1212   /// scalable). If the factors are 0, vectorization and interleaving should be
1213   /// avoided up front.
1214   FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
1215 
1216   /// \return True if runtime checks are required for vectorization, and false
1217   /// otherwise.
1218   bool runtimeChecksRequired();
1219 
1220   /// Setup cost-based decisions for user vectorization factor.
1221   /// \return true if the UserVF is a feasible VF to be chosen.
1222   bool selectUserVectorizationFactor(ElementCount UserVF) {
1223     collectUniformsAndScalars(UserVF);
1224     collectInstsToScalarize(UserVF);
1225     return expectedCost(UserVF).first.isValid();
1226   }
1227 
1228   /// \return The size (in bits) of the smallest and widest types in the code
1229   /// that needs to be vectorized. We ignore values that remain scalar such as
1230   /// 64 bit loop indices.
1231   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1232 
1233   /// \return The desired interleave count.
1234   /// If interleave count has been specified by metadata it will be returned.
1235   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1236   /// are the selected vectorization factor and the cost of the selected VF.
1237   unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost);
1238 
1239   /// Memory access instruction may be vectorized in more than one way.
1240   /// Form of instruction after vectorization depends on cost.
1241   /// This function takes cost-based decisions for Load/Store instructions
1242   /// and collects them in a map. This decisions map is used for building
1243   /// the lists of loop-uniform and loop-scalar instructions.
1244   /// The calculated cost is saved with widening decision in order to
1245   /// avoid redundant calculations.
1246   void setCostBasedWideningDecision(ElementCount VF);
1247 
1248   /// A call may be vectorized in different ways depending on whether we have
1249   /// vectorized variants available and whether the target supports masking.
1250   /// This function analyzes all calls in the function at the supplied VF,
1251   /// makes a decision based on the costs of available options, and stores that
1252   /// decision in a map for use in planning and plan execution.
1253   void setVectorizedCallDecision(ElementCount VF);
1254 
1255   /// A struct that represents some properties of the register usage
1256   /// of a loop.
1257   struct RegisterUsage {
1258     /// Holds the number of loop invariant values that are used in the loop.
1259     /// The key is ClassID of target-provided register class.
1260     SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1261     /// Holds the maximum number of concurrent live intervals in the loop.
1262     /// The key is ClassID of target-provided register class.
1263     SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1264   };
1265 
1266   /// \return Returns information about the register usages of the loop for the
1267   /// given vectorization factors.
1268   SmallVector<RegisterUsage, 8>
1269   calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1270 
1271   /// Collect values we want to ignore in the cost model.
1272   void collectValuesToIgnore();
1273 
1274   /// Collect all element types in the loop for which widening is needed.
1275   void collectElementTypesForWidening();
1276 
1277   /// Split reductions into those that happen in the loop, and those that happen
1278   /// outside. In loop reductions are collected into InLoopReductions.
1279   void collectInLoopReductions();
1280 
1281   /// Returns true if we should use strict in-order reductions for the given
1282   /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
1283   /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
1284   /// of FP operations.
1285   bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
1286     return !Hints->allowReordering() && RdxDesc.isOrdered();
1287   }
1288 
1289   /// \returns The smallest bitwidth each instruction can be represented with.
1290   /// The vector equivalents of these instructions should be truncated to this
1291   /// type.
1292   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1293     return MinBWs;
1294   }
1295 
1296   /// \returns True if it is more profitable to scalarize instruction \p I for
1297   /// vectorization factor \p VF.
1298   bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1299     assert(VF.isVector() &&
1300            "Profitable to scalarize relevant only for VF > 1.");
1301 
1302     // Cost model is not run in the VPlan-native path - return conservative
1303     // result until this changes.
1304     if (EnableVPlanNativePath)
1305       return false;
1306 
1307     auto Scalars = InstsToScalarize.find(VF);
1308     assert(Scalars != InstsToScalarize.end() &&
1309            "VF not yet analyzed for scalarization profitability");
1310     return Scalars->second.contains(I);
1311   }
1312 
1313   /// Returns true if \p I is known to be uniform after vectorization.
1314   bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1315     // Pseudo probe needs to be duplicated for each unrolled iteration and
1316     // vector lane so that profiled loop trip count can be accurately
1317     // accumulated instead of being under counted.
1318     if (isa<PseudoProbeInst>(I))
1319       return false;
1320 
1321     if (VF.isScalar())
1322       return true;
1323 
1324     // Cost model is not run in the VPlan-native path - return conservative
1325     // result until this changes.
1326     if (EnableVPlanNativePath)
1327       return false;
1328 
1329     auto UniformsPerVF = Uniforms.find(VF);
1330     assert(UniformsPerVF != Uniforms.end() &&
1331            "VF not yet analyzed for uniformity");
1332     return UniformsPerVF->second.count(I);
1333   }
1334 
1335   /// Returns true if \p I is known to be scalar after vectorization.
1336   bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1337     if (VF.isScalar())
1338       return true;
1339 
1340     // Cost model is not run in the VPlan-native path - return conservative
1341     // result until this changes.
1342     if (EnableVPlanNativePath)
1343       return false;
1344 
1345     auto ScalarsPerVF = Scalars.find(VF);
1346     assert(ScalarsPerVF != Scalars.end() &&
1347            "Scalar values are not calculated for VF");
1348     return ScalarsPerVF->second.count(I);
1349   }
1350 
1351   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1352   /// for vectorization factor \p VF.
1353   bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1354     return VF.isVector() && MinBWs.contains(I) &&
1355            !isProfitableToScalarize(I, VF) &&
1356            !isScalarAfterVectorization(I, VF);
1357   }
1358 
1359   /// Decision that was taken during cost calculation for memory instruction.
1360   enum InstWidening {
1361     CM_Unknown,
1362     CM_Widen,         // For consecutive accesses with stride +1.
1363     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1364     CM_Interleave,
1365     CM_GatherScatter,
1366     CM_Scalarize,
1367     CM_VectorCall,
1368     CM_IntrinsicCall
1369   };
1370 
1371   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1372   /// instruction \p I and vector width \p VF.
1373   void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1374                            InstructionCost Cost) {
1375     assert(VF.isVector() && "Expected VF >=2");
1376     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1377   }
1378 
1379   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1380   /// interleaving group \p Grp and vector width \p VF.
1381   void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1382                            ElementCount VF, InstWidening W,
1383                            InstructionCost Cost) {
1384     assert(VF.isVector() && "Expected VF >=2");
1385     /// Broadcast this decicion to all instructions inside the group.
1386     /// But the cost will be assigned to one instruction only.
1387     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1388       if (auto *I = Grp->getMember(i)) {
1389         if (Grp->getInsertPos() == I)
1390           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1391         else
1392           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1393       }
1394     }
1395   }
1396 
1397   /// Return the cost model decision for the given instruction \p I and vector
1398   /// width \p VF. Return CM_Unknown if this instruction did not pass
1399   /// through the cost modeling.
1400   InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1401     assert(VF.isVector() && "Expected VF to be a vector VF");
1402     // Cost model is not run in the VPlan-native path - return conservative
1403     // result until this changes.
1404     if (EnableVPlanNativePath)
1405       return CM_GatherScatter;
1406 
1407     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1408     auto Itr = WideningDecisions.find(InstOnVF);
1409     if (Itr == WideningDecisions.end())
1410       return CM_Unknown;
1411     return Itr->second.first;
1412   }
1413 
1414   /// Return the vectorization cost for the given instruction \p I and vector
1415   /// width \p VF.
1416   InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1417     assert(VF.isVector() && "Expected VF >=2");
1418     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1419     assert(WideningDecisions.contains(InstOnVF) &&
1420            "The cost is not calculated");
1421     return WideningDecisions[InstOnVF].second;
1422   }
1423 
1424   struct CallWideningDecision {
1425     InstWidening Kind;
1426     Function *Variant;
1427     Intrinsic::ID IID;
1428     std::optional<unsigned> MaskPos;
1429     InstructionCost Cost;
1430   };
1431 
1432   void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind,
1433                                Function *Variant, Intrinsic::ID IID,
1434                                std::optional<unsigned> MaskPos,
1435                                InstructionCost Cost) {
1436     assert(!VF.isScalar() && "Expected vector VF");
1437     CallWideningDecisions[std::make_pair(CI, VF)] = {Kind, Variant, IID,
1438                                                      MaskPos, Cost};
1439   }
1440 
1441   CallWideningDecision getCallWideningDecision(CallInst *CI,
1442                                                ElementCount VF) const {
1443     assert(!VF.isScalar() && "Expected vector VF");
1444     return CallWideningDecisions.at(std::make_pair(CI, VF));
1445   }
1446 
1447   /// Return True if instruction \p I is an optimizable truncate whose operand
1448   /// is an induction variable. Such a truncate will be removed by adding a new
1449   /// induction variable with the destination type.
1450   bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1451     // If the instruction is not a truncate, return false.
1452     auto *Trunc = dyn_cast<TruncInst>(I);
1453     if (!Trunc)
1454       return false;
1455 
1456     // Get the source and destination types of the truncate.
1457     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1458     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1459 
1460     // If the truncate is free for the given types, return false. Replacing a
1461     // free truncate with an induction variable would add an induction variable
1462     // update instruction to each iteration of the loop. We exclude from this
1463     // check the primary induction variable since it will need an update
1464     // instruction regardless.
1465     Value *Op = Trunc->getOperand(0);
1466     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1467       return false;
1468 
1469     // If the truncated value is not an induction variable, return false.
1470     return Legal->isInductionPhi(Op);
1471   }
1472 
1473   /// Collects the instructions to scalarize for each predicated instruction in
1474   /// the loop.
1475   void collectInstsToScalarize(ElementCount VF);
1476 
1477   /// Collect Uniform and Scalar values for the given \p VF.
1478   /// The sets depend on CM decision for Load/Store instructions
1479   /// that may be vectorized as interleave, gather-scatter or scalarized.
1480   /// Also make a decision on what to do about call instructions in the loop
1481   /// at that VF -- scalarize, call a known vector routine, or call a
1482   /// vector intrinsic.
1483   void collectUniformsAndScalars(ElementCount VF) {
1484     // Do the analysis once.
1485     if (VF.isScalar() || Uniforms.contains(VF))
1486       return;
1487     setCostBasedWideningDecision(VF);
1488     setVectorizedCallDecision(VF);
1489     collectLoopUniforms(VF);
1490     collectLoopScalars(VF);
1491   }
1492 
1493   /// Returns true if the target machine supports masked store operation
1494   /// for the given \p DataType and kind of access to \p Ptr.
1495   bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1496     return Legal->isConsecutivePtr(DataType, Ptr) &&
1497            TTI.isLegalMaskedStore(DataType, Alignment);
1498   }
1499 
1500   /// Returns true if the target machine supports masked load operation
1501   /// for the given \p DataType and kind of access to \p Ptr.
1502   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1503     return Legal->isConsecutivePtr(DataType, Ptr) &&
1504            TTI.isLegalMaskedLoad(DataType, Alignment);
1505   }
1506 
1507   /// Returns true if the target machine can represent \p V as a masked gather
1508   /// or scatter operation.
1509   bool isLegalGatherOrScatter(Value *V, ElementCount VF) {
1510     bool LI = isa<LoadInst>(V);
1511     bool SI = isa<StoreInst>(V);
1512     if (!LI && !SI)
1513       return false;
1514     auto *Ty = getLoadStoreType(V);
1515     Align Align = getLoadStoreAlignment(V);
1516     if (VF.isVector())
1517       Ty = VectorType::get(Ty, VF);
1518     return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1519            (SI && TTI.isLegalMaskedScatter(Ty, Align));
1520   }
1521 
1522   /// Returns true if the target machine supports all of the reduction
1523   /// variables found for the given VF.
1524   bool canVectorizeReductions(ElementCount VF) const {
1525     return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1526       const RecurrenceDescriptor &RdxDesc = Reduction.second;
1527       return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1528     }));
1529   }
1530 
1531   /// Given costs for both strategies, return true if the scalar predication
1532   /// lowering should be used for div/rem.  This incorporates an override
1533   /// option so it is not simply a cost comparison.
1534   bool isDivRemScalarWithPredication(InstructionCost ScalarCost,
1535                                      InstructionCost SafeDivisorCost) const {
1536     switch (ForceSafeDivisor) {
1537     case cl::BOU_UNSET:
1538       return ScalarCost < SafeDivisorCost;
1539     case cl::BOU_TRUE:
1540       return false;
1541     case cl::BOU_FALSE:
1542       return true;
1543     };
1544     llvm_unreachable("impossible case value");
1545   }
1546 
1547   /// Returns true if \p I is an instruction which requires predication and
1548   /// for which our chosen predication strategy is scalarization (i.e. we
1549   /// don't have an alternate strategy such as masking available).
1550   /// \p VF is the vectorization factor that will be used to vectorize \p I.
1551   bool isScalarWithPredication(Instruction *I, ElementCount VF) const;
1552 
1553   /// Returns true if \p I is an instruction that needs to be predicated
1554   /// at runtime.  The result is independent of the predication mechanism.
1555   /// Superset of instructions that return true for isScalarWithPredication.
1556   bool isPredicatedInst(Instruction *I) const;
1557 
1558   /// Return the costs for our two available strategies for lowering a
1559   /// div/rem operation which requires speculating at least one lane.
1560   /// First result is for scalarization (will be invalid for scalable
1561   /// vectors); second is for the safe-divisor strategy.
1562   std::pair<InstructionCost, InstructionCost>
1563   getDivRemSpeculationCost(Instruction *I,
1564                            ElementCount VF) const;
1565 
1566   /// Returns true if \p I is a memory instruction with consecutive memory
1567   /// access that can be widened.
1568   bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF);
1569 
1570   /// Returns true if \p I is a memory instruction in an interleaved-group
1571   /// of memory accesses that can be vectorized with wide vector loads/stores
1572   /// and shuffles.
1573   bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF);
1574 
1575   /// Check if \p Instr belongs to any interleaved access group.
1576   bool isAccessInterleaved(Instruction *Instr) {
1577     return InterleaveInfo.isInterleaved(Instr);
1578   }
1579 
1580   /// Get the interleaved access group that \p Instr belongs to.
1581   const InterleaveGroup<Instruction> *
1582   getInterleavedAccessGroup(Instruction *Instr) {
1583     return InterleaveInfo.getInterleaveGroup(Instr);
1584   }
1585 
1586   /// Returns true if we're required to use a scalar epilogue for at least
1587   /// the final iteration of the original loop.
1588   bool requiresScalarEpilogue(bool IsVectorizing) const {
1589     if (!isScalarEpilogueAllowed())
1590       return false;
1591     // If we might exit from anywhere but the latch, must run the exiting
1592     // iteration in scalar form.
1593     if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
1594       return true;
1595     return IsVectorizing && InterleaveInfo.requiresScalarEpilogue();
1596   }
1597 
1598   /// Returns true if we're required to use a scalar epilogue for at least
1599   /// the final iteration of the original loop for all VFs in \p Range.
1600   /// A scalar epilogue must either be required for all VFs in \p Range or for
1601   /// none.
1602   bool requiresScalarEpilogue(VFRange Range) const {
1603     auto RequiresScalarEpilogue = [this](ElementCount VF) {
1604       return requiresScalarEpilogue(VF.isVector());
1605     };
1606     bool IsRequired = all_of(Range, RequiresScalarEpilogue);
1607     assert(
1608         (IsRequired || none_of(Range, RequiresScalarEpilogue)) &&
1609         "all VFs in range must agree on whether a scalar epilogue is required");
1610     return IsRequired;
1611   }
1612 
1613   /// Returns true if a scalar epilogue is not allowed due to optsize or a
1614   /// loop hint annotation.
1615   bool isScalarEpilogueAllowed() const {
1616     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1617   }
1618 
1619   /// Returns the TailFoldingStyle that is best for the current loop.
1620   TailFoldingStyle
1621   getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
1622     if (!CanFoldTailByMasking)
1623       return TailFoldingStyle::None;
1624 
1625     if (ForceTailFoldingStyle.getNumOccurrences())
1626       return ForceTailFoldingStyle;
1627 
1628     return TTI.getPreferredTailFoldingStyle(IVUpdateMayOverflow);
1629   }
1630 
1631   /// Returns true if all loop blocks should be masked to fold tail loop.
1632   bool foldTailByMasking() const {
1633     return getTailFoldingStyle() != TailFoldingStyle::None;
1634   }
1635 
1636   /// Returns true if the instructions in this block requires predication
1637   /// for any reason, e.g. because tail folding now requires a predicate
1638   /// or because the block in the original loop was predicated.
1639   bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
1640     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1641   }
1642 
1643   /// Returns true if the Phi is part of an inloop reduction.
1644   bool isInLoopReduction(PHINode *Phi) const {
1645     return InLoopReductions.contains(Phi);
1646   }
1647 
1648   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1649   /// with factor VF.  Return the cost of the instruction, including
1650   /// scalarization overhead if it's needed.
1651   InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1652 
1653   /// Estimate cost of a call instruction CI if it were vectorized with factor
1654   /// VF. Return the cost of the instruction, including scalarization overhead
1655   /// if it's needed.
1656   InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const;
1657 
1658   /// Invalidates decisions already taken by the cost model.
1659   void invalidateCostModelingDecisions() {
1660     WideningDecisions.clear();
1661     CallWideningDecisions.clear();
1662     Uniforms.clear();
1663     Scalars.clear();
1664   }
1665 
1666   /// The vectorization cost is a combination of the cost itself and a boolean
1667   /// indicating whether any of the contributing operations will actually
1668   /// operate on vector values after type legalization in the backend. If this
1669   /// latter value is false, then all operations will be scalarized (i.e. no
1670   /// vectorization has actually taken place).
1671   using VectorizationCostTy = std::pair<InstructionCost, bool>;
1672 
1673   /// Returns the expected execution cost. The unit of the cost does
1674   /// not matter because we use the 'cost' units to compare different
1675   /// vector widths. The cost that is returned is *not* normalized by
1676   /// the factor width. If \p Invalid is not nullptr, this function
1677   /// will add a pair(Instruction*, ElementCount) to \p Invalid for
1678   /// each instruction that has an Invalid cost for the given VF.
1679   VectorizationCostTy
1680   expectedCost(ElementCount VF,
1681                SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);
1682 
1683   bool hasPredStores() const { return NumPredStores > 0; }
1684 
1685   /// Returns true if epilogue vectorization is considered profitable, and
1686   /// false otherwise.
1687   /// \p VF is the vectorization factor chosen for the original loop.
1688   bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
1689 
1690 private:
1691   unsigned NumPredStores = 0;
1692 
1693   /// \return An upper bound for the vectorization factors for both
1694   /// fixed and scalable vectorization, where the minimum-known number of
1695   /// elements is a power-of-2 larger than zero. If scalable vectorization is
1696   /// disabled or unsupported, then the scalable part will be equal to
1697   /// ElementCount::getScalable(0).
1698   FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount,
1699                                            ElementCount UserVF,
1700                                            bool FoldTailByMasking);
1701 
1702   /// \return the maximized element count based on the targets vector
1703   /// registers and the loop trip-count, but limited to a maximum safe VF.
1704   /// This is a helper function of computeFeasibleMaxVF.
1705   ElementCount getMaximizedVFForTarget(unsigned MaxTripCount,
1706                                        unsigned SmallestType,
1707                                        unsigned WidestType,
1708                                        ElementCount MaxSafeVF,
1709                                        bool FoldTailByMasking);
1710 
1711   /// \return the maximum legal scalable VF, based on the safe max number
1712   /// of elements.
1713   ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1714 
1715   /// Returns the execution time cost of an instruction for a given vector
1716   /// width. Vector width of one means scalar.
1717   VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1718 
1719   /// The cost-computation logic from getInstructionCost which provides
1720   /// the vector type as an output parameter.
1721   InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
1722                                      Type *&VectorTy);
1723 
1724   /// Return the cost of instructions in an inloop reduction pattern, if I is
1725   /// part of that pattern.
1726   std::optional<InstructionCost>
1727   getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
1728                           TTI::TargetCostKind CostKind) const;
1729 
1730   /// Calculate vectorization cost of memory instruction \p I.
1731   InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1732 
1733   /// The cost computation for scalarized memory instruction.
1734   InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1735 
1736   /// The cost computation for interleaving group of memory instructions.
1737   InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1738 
1739   /// The cost computation for Gather/Scatter instruction.
1740   InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1741 
1742   /// The cost computation for widening instruction \p I with consecutive
1743   /// memory access.
1744   InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1745 
1746   /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1747   /// Load: scalar load + broadcast.
1748   /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1749   /// element)
1750   InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1751 
1752   /// Estimate the overhead of scalarizing an instruction. This is a
1753   /// convenience wrapper for the type-based getScalarizationOverhead API.
1754   InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF,
1755                                            TTI::TargetCostKind CostKind) const;
1756 
1757   /// Returns true if an artificially high cost for emulated masked memrefs
1758   /// should be used.
1759   bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1760 
1761   /// Map of scalar integer values to the smallest bitwidth they can be legally
1762   /// represented as. The vector equivalents of these values should be truncated
1763   /// to this type.
1764   MapVector<Instruction *, uint64_t> MinBWs;
1765 
1766   /// A type representing the costs for instructions if they were to be
1767   /// scalarized rather than vectorized. The entries are Instruction-Cost
1768   /// pairs.
1769   using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1770 
1771   /// A set containing all BasicBlocks that are known to present after
1772   /// vectorization as a predicated block.
1773   DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>>
1774       PredicatedBBsAfterVectorization;
1775 
1776   /// Records whether it is allowed to have the original scalar loop execute at
1777   /// least once. This may be needed as a fallback loop in case runtime
1778   /// aliasing/dependence checks fail, or to handle the tail/remainder
1779   /// iterations when the trip count is unknown or doesn't divide by the VF,
1780   /// or as a peel-loop to handle gaps in interleave-groups.
1781   /// Under optsize and when the trip count is very small we don't allow any
1782   /// iterations to execute in the scalar loop.
1783   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1784 
1785   /// All blocks of loop are to be masked to fold tail of scalar iterations.
1786   bool CanFoldTailByMasking = false;
1787 
1788   /// A map holding scalar costs for different vectorization factors. The
1789   /// presence of a cost for an instruction in the mapping indicates that the
1790   /// instruction will be scalarized when vectorizing with the associated
1791   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1792   DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1793 
1794   /// Holds the instructions known to be uniform after vectorization.
1795   /// The data is collected per VF.
1796   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1797 
1798   /// Holds the instructions known to be scalar after vectorization.
1799   /// The data is collected per VF.
1800   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1801 
1802   /// Holds the instructions (address computations) that are forced to be
1803   /// scalarized.
1804   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1805 
1806   /// PHINodes of the reductions that should be expanded in-loop.
1807   SmallPtrSet<PHINode *, 4> InLoopReductions;
1808 
1809   /// A Map of inloop reduction operations and their immediate chain operand.
1810   /// FIXME: This can be removed once reductions can be costed correctly in
1811   /// VPlan. This was added to allow quick lookup of the inloop operations.
1812   DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1813 
1814   /// Returns the expected difference in cost from scalarizing the expression
1815   /// feeding a predicated instruction \p PredInst. The instructions to
1816   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1817   /// non-negative return value implies the expression will be scalarized.
1818   /// Currently, only single-use chains are considered for scalarization.
1819   InstructionCost computePredInstDiscount(Instruction *PredInst,
1820                                           ScalarCostsTy &ScalarCosts,
1821                                           ElementCount VF);
1822 
1823   /// Collect the instructions that are uniform after vectorization. An
1824   /// instruction is uniform if we represent it with a single scalar value in
1825   /// the vectorized loop corresponding to each vector iteration. Examples of
1826   /// uniform instructions include pointer operands of consecutive or
1827   /// interleaved memory accesses. Note that although uniformity implies an
1828   /// instruction will be scalar, the reverse is not true. In general, a
1829   /// scalarized instruction will be represented by VF scalar values in the
1830   /// vectorized loop, each corresponding to an iteration of the original
1831   /// scalar loop.
1832   void collectLoopUniforms(ElementCount VF);
1833 
1834   /// Collect the instructions that are scalar after vectorization. An
1835   /// instruction is scalar if it is known to be uniform or will be scalarized
1836   /// during vectorization. collectLoopScalars should only add non-uniform nodes
1837   /// to the list if they are used by a load/store instruction that is marked as
1838   /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
1839   /// VF values in the vectorized loop, each corresponding to an iteration of
1840   /// the original scalar loop.
1841   void collectLoopScalars(ElementCount VF);
1842 
1843   /// Keeps cost model vectorization decision and cost for instructions.
1844   /// Right now it is used for memory instructions only.
1845   using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1846                                 std::pair<InstWidening, InstructionCost>>;
1847 
1848   DecisionList WideningDecisions;
1849 
1850   using CallDecisionList =
1851       DenseMap<std::pair<CallInst *, ElementCount>, CallWideningDecision>;
1852 
1853   CallDecisionList CallWideningDecisions;
1854 
1855   /// Returns true if \p V is expected to be vectorized and it needs to be
1856   /// extracted.
1857   bool needsExtract(Value *V, ElementCount VF) const {
1858     Instruction *I = dyn_cast<Instruction>(V);
1859     if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1860         TheLoop->isLoopInvariant(I))
1861       return false;
1862 
1863     // Assume we can vectorize V (and hence we need extraction) if the
1864     // scalars are not computed yet. This can happen, because it is called
1865     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1866     // the scalars are collected. That should be a safe assumption in most
1867     // cases, because we check if the operands have vectorizable types
1868     // beforehand in LoopVectorizationLegality.
1869     return !Scalars.contains(VF) || !isScalarAfterVectorization(I, VF);
1870   };
1871 
1872   /// Returns a range containing only operands needing to be extracted.
1873   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1874                                                    ElementCount VF) const {
1875     return SmallVector<Value *, 4>(make_filter_range(
1876         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1877   }
1878 
1879 public:
1880   /// The loop that we evaluate.
1881   Loop *TheLoop;
1882 
1883   /// Predicated scalar evolution analysis.
1884   PredicatedScalarEvolution &PSE;
1885 
1886   /// Loop Info analysis.
1887   LoopInfo *LI;
1888 
1889   /// Vectorization legality.
1890   LoopVectorizationLegality *Legal;
1891 
1892   /// Vector target information.
1893   const TargetTransformInfo &TTI;
1894 
1895   /// Target Library Info.
1896   const TargetLibraryInfo *TLI;
1897 
1898   /// Demanded bits analysis.
1899   DemandedBits *DB;
1900 
1901   /// Assumption cache.
1902   AssumptionCache *AC;
1903 
1904   /// Interface to emit optimization remarks.
1905   OptimizationRemarkEmitter *ORE;
1906 
1907   const Function *TheFunction;
1908 
1909   /// Loop Vectorize Hint.
1910   const LoopVectorizeHints *Hints;
1911 
1912   /// The interleave access information contains groups of interleaved accesses
1913   /// with the same stride and close to each other.
1914   InterleavedAccessInfo &InterleaveInfo;
1915 
1916   /// Values to ignore in the cost model.
1917   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1918 
1919   /// Values to ignore in the cost model when VF > 1.
1920   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1921 
1922   /// All element types found in the loop.
1923   SmallPtrSet<Type *, 16> ElementTypesInLoop;
1924 };
1925 } // end namespace llvm
1926 
1927 namespace {
1928 /// Helper struct to manage generating runtime checks for vectorization.
1929 ///
1930 /// The runtime checks are created up-front in temporary blocks to allow better
1931 /// estimating the cost and un-linked from the existing IR. After deciding to
1932 /// vectorize, the checks are moved back. If deciding not to vectorize, the
1933 /// temporary blocks are completely removed.
1934 class GeneratedRTChecks {
1935   /// Basic block which contains the generated SCEV checks, if any.
1936   BasicBlock *SCEVCheckBlock = nullptr;
1937 
1938   /// The value representing the result of the generated SCEV checks. If it is
1939   /// nullptr, either no SCEV checks have been generated or they have been used.
1940   Value *SCEVCheckCond = nullptr;
1941 
1942   /// Basic block which contains the generated memory runtime checks, if any.
1943   BasicBlock *MemCheckBlock = nullptr;
1944 
1945   /// The value representing the result of the generated memory runtime checks.
1946   /// If it is nullptr, either no memory runtime checks have been generated or
1947   /// they have been used.
1948   Value *MemRuntimeCheckCond = nullptr;
1949 
1950   DominatorTree *DT;
1951   LoopInfo *LI;
1952   TargetTransformInfo *TTI;
1953 
1954   SCEVExpander SCEVExp;
1955   SCEVExpander MemCheckExp;
1956 
1957   bool CostTooHigh = false;
1958   const bool AddBranchWeights;
1959 
1960 public:
1961   GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
1962                     TargetTransformInfo *TTI, const DataLayout &DL,
1963                     bool AddBranchWeights)
1964       : DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"),
1965         MemCheckExp(SE, DL, "scev.check"), AddBranchWeights(AddBranchWeights) {}
1966 
1967   /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1968   /// accurately estimate the cost of the runtime checks. The blocks are
1969   /// un-linked from the IR and is added back during vector code generation. If
1970   /// there is no vector code generation, the check blocks are removed
1971   /// completely.
1972   void Create(Loop *L, const LoopAccessInfo &LAI,
1973               const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {
1974 
1975     // Hard cutoff to limit compile-time increase in case a very large number of
1976     // runtime checks needs to be generated.
1977     // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
1978     // profile info.
1979     CostTooHigh =
1980         LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold;
1981     if (CostTooHigh)
1982       return;
1983 
1984     BasicBlock *LoopHeader = L->getHeader();
1985     BasicBlock *Preheader = L->getLoopPreheader();
1986 
1987     // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1988     // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1989     // may be used by SCEVExpander. The blocks will be un-linked from their
1990     // predecessors and removed from LI & DT at the end of the function.
1991     if (!UnionPred.isAlwaysTrue()) {
1992       SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1993                                   nullptr, "vector.scevcheck");
1994 
1995       SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1996           &UnionPred, SCEVCheckBlock->getTerminator());
1997     }
1998 
1999     const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
2000     if (RtPtrChecking.Need) {
2001       auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
2002       MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
2003                                  "vector.memcheck");
2004 
2005       auto DiffChecks = RtPtrChecking.getDiffChecks();
2006       if (DiffChecks) {
2007         Value *RuntimeVF = nullptr;
2008         MemRuntimeCheckCond = addDiffRuntimeChecks(
2009             MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp,
2010             [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
2011               if (!RuntimeVF)
2012                 RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);
2013               return RuntimeVF;
2014             },
2015             IC);
2016       } else {
2017         MemRuntimeCheckCond = addRuntimeChecks(
2018             MemCheckBlock->getTerminator(), L, RtPtrChecking.getChecks(),
2019             MemCheckExp, VectorizerParams::HoistRuntimeChecks);
2020       }
2021       assert(MemRuntimeCheckCond &&
2022              "no RT checks generated although RtPtrChecking "
2023              "claimed checks are required");
2024     }
2025 
2026     if (!MemCheckBlock && !SCEVCheckBlock)
2027       return;
2028 
2029     // Unhook the temporary block with the checks, update various places
2030     // accordingly.
2031     if (SCEVCheckBlock)
2032       SCEVCheckBlock->replaceAllUsesWith(Preheader);
2033     if (MemCheckBlock)
2034       MemCheckBlock->replaceAllUsesWith(Preheader);
2035 
2036     if (SCEVCheckBlock) {
2037       SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
2038       new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
2039       Preheader->getTerminator()->eraseFromParent();
2040     }
2041     if (MemCheckBlock) {
2042       MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
2043       new UnreachableInst(Preheader->getContext(), MemCheckBlock);
2044       Preheader->getTerminator()->eraseFromParent();
2045     }
2046 
2047     DT->changeImmediateDominator(LoopHeader, Preheader);
2048     if (MemCheckBlock) {
2049       DT->eraseNode(MemCheckBlock);
2050       LI->removeBlock(MemCheckBlock);
2051     }
2052     if (SCEVCheckBlock) {
2053       DT->eraseNode(SCEVCheckBlock);
2054       LI->removeBlock(SCEVCheckBlock);
2055     }
2056   }
2057 
2058   InstructionCost getCost() {
2059     if (SCEVCheckBlock || MemCheckBlock)
2060       LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");
2061 
2062     if (CostTooHigh) {
2063       InstructionCost Cost;
2064       Cost.setInvalid();
2065       LLVM_DEBUG(dbgs() << "  number of checks exceeded threshold\n");
2066       return Cost;
2067     }
2068 
2069     InstructionCost RTCheckCost = 0;
2070     if (SCEVCheckBlock)
2071       for (Instruction &I : *SCEVCheckBlock) {
2072         if (SCEVCheckBlock->getTerminator() == &I)
2073           continue;
2074         InstructionCost C =
2075             TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
2076         LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
2077         RTCheckCost += C;
2078       }
2079     if (MemCheckBlock)
2080       for (Instruction &I : *MemCheckBlock) {
2081         if (MemCheckBlock->getTerminator() == &I)
2082           continue;
2083         InstructionCost C =
2084             TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
2085         LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
2086         RTCheckCost += C;
2087       }
2088 
2089     if (SCEVCheckBlock || MemCheckBlock)
2090       LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
2091                         << "\n");
2092 
2093     return RTCheckCost;
2094   }
2095 
2096   /// Remove the created SCEV & memory runtime check blocks & instructions, if
2097   /// unused.
2098   ~GeneratedRTChecks() {
2099     SCEVExpanderCleaner SCEVCleaner(SCEVExp);
2100     SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
2101     if (!SCEVCheckCond)
2102       SCEVCleaner.markResultUsed();
2103 
2104     if (!MemRuntimeCheckCond)
2105       MemCheckCleaner.markResultUsed();
2106 
2107     if (MemRuntimeCheckCond) {
2108       auto &SE = *MemCheckExp.getSE();
2109       // Memory runtime check generation creates compares that use expanded
2110       // values. Remove them before running the SCEVExpanderCleaners.
2111       for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
2112         if (MemCheckExp.isInsertedInstruction(&I))
2113           continue;
2114         SE.forgetValue(&I);
2115         I.eraseFromParent();
2116       }
2117     }
2118     MemCheckCleaner.cleanup();
2119     SCEVCleaner.cleanup();
2120 
2121     if (SCEVCheckCond)
2122       SCEVCheckBlock->eraseFromParent();
2123     if (MemRuntimeCheckCond)
2124       MemCheckBlock->eraseFromParent();
2125   }
2126 
2127   /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
2128   /// adjusts the branches to branch to the vector preheader or \p Bypass,
2129   /// depending on the generated condition.
2130   BasicBlock *emitSCEVChecks(BasicBlock *Bypass,
2131                              BasicBlock *LoopVectorPreHeader,
2132                              BasicBlock *LoopExitBlock) {
2133     if (!SCEVCheckCond)
2134       return nullptr;
2135 
2136     Value *Cond = SCEVCheckCond;
2137     // Mark the check as used, to prevent it from being removed during cleanup.
2138     SCEVCheckCond = nullptr;
2139     if (auto *C = dyn_cast<ConstantInt>(Cond))
2140       if (C->isZero())
2141         return nullptr;
2142 
2143     auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2144 
2145     BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
2146     // Create new preheader for vector loop.
2147     if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2148       PL->addBasicBlockToLoop(SCEVCheckBlock, *LI);
2149 
2150     SCEVCheckBlock->getTerminator()->eraseFromParent();
2151     SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2152     Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2153                                                 SCEVCheckBlock);
2154 
2155     DT->addNewBlock(SCEVCheckBlock, Pred);
2156     DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
2157 
2158     BranchInst &BI = *BranchInst::Create(Bypass, LoopVectorPreHeader, Cond);
2159     if (AddBranchWeights)
2160       setBranchWeights(BI, SCEVCheckBypassWeights);
2161     ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), &BI);
2162     return SCEVCheckBlock;
2163   }
2164 
2165   /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2166   /// the branches to branch to the vector preheader or \p Bypass, depending on
2167   /// the generated condition.
2168   BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass,
2169                                    BasicBlock *LoopVectorPreHeader) {
2170     // Check if we generated code that checks in runtime if arrays overlap.
2171     if (!MemRuntimeCheckCond)
2172       return nullptr;
2173 
2174     auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2175     Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2176                                                 MemCheckBlock);
2177 
2178     DT->addNewBlock(MemCheckBlock, Pred);
2179     DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
2180     MemCheckBlock->moveBefore(LoopVectorPreHeader);
2181 
2182     if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2183       PL->addBasicBlockToLoop(MemCheckBlock, *LI);
2184 
2185     BranchInst &BI =
2186         *BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond);
2187     if (AddBranchWeights) {
2188       setBranchWeights(BI, MemCheckBypassWeights);
2189     }
2190     ReplaceInstWithInst(MemCheckBlock->getTerminator(), &BI);
2191     MemCheckBlock->getTerminator()->setDebugLoc(
2192         Pred->getTerminator()->getDebugLoc());
2193 
2194     // Mark the check as used, to prevent it from being removed during cleanup.
2195     MemRuntimeCheckCond = nullptr;
2196     return MemCheckBlock;
2197   }
2198 };
2199 } // namespace
2200 
2201 static bool useActiveLaneMask(TailFoldingStyle Style) {
2202   return Style == TailFoldingStyle::Data ||
2203          Style == TailFoldingStyle::DataAndControlFlow ||
2204          Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2205 }
2206 
2207 static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style) {
2208   return Style == TailFoldingStyle::DataAndControlFlow ||
2209          Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2210 }
2211 
2212 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
2213 // vectorization. The loop needs to be annotated with #pragma omp simd
2214 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
2215 // vector length information is not provided, vectorization is not considered
2216 // explicit. Interleave hints are not allowed either. These limitations will be
2217 // relaxed in the future.
2218 // Please, note that we are currently forced to abuse the pragma 'clang
2219 // vectorize' semantics. This pragma provides *auto-vectorization hints*
2220 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2221 // provides *explicit vectorization hints* (LV can bypass legal checks and
2222 // assume that vectorization is legal). However, both hints are implemented
2223 // using the same metadata (llvm.loop.vectorize, processed by
2224 // LoopVectorizeHints). This will be fixed in the future when the native IR
2225 // representation for pragma 'omp simd' is introduced.
2226 static bool isExplicitVecOuterLoop(Loop *OuterLp,
2227                                    OptimizationRemarkEmitter *ORE) {
2228   assert(!OuterLp->isInnermost() && "This is not an outer loop");
2229   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2230 
2231   // Only outer loops with an explicit vectorization hint are supported.
2232   // Unannotated outer loops are ignored.
2233   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
2234     return false;
2235 
2236   Function *Fn = OuterLp->getHeader()->getParent();
2237   if (!Hints.allowVectorization(Fn, OuterLp,
2238                                 true /*VectorizeOnlyWhenForced*/)) {
2239     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2240     return false;
2241   }
2242 
2243   if (Hints.getInterleave() > 1) {
2244     // TODO: Interleave support is future work.
2245     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2246                          "outer loops.\n");
2247     Hints.emitRemarkWithHints();
2248     return false;
2249   }
2250 
2251   return true;
2252 }
2253 
2254 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
2255                                   OptimizationRemarkEmitter *ORE,
2256                                   SmallVectorImpl<Loop *> &V) {
2257   // Collect inner loops and outer loops without irreducible control flow. For
2258   // now, only collect outer loops that have explicit vectorization hints. If we
2259   // are stress testing the VPlan H-CFG construction, we collect the outermost
2260   // loop of every loop nest.
2261   if (L.isInnermost() || VPlanBuildStressTest ||
2262       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
2263     LoopBlocksRPO RPOT(&L);
2264     RPOT.perform(LI);
2265     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2266       V.push_back(&L);
2267       // TODO: Collect inner loops inside marked outer loops in case
2268       // vectorization fails for the outer loop. Do not invoke
2269       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2270       // already known to be reducible. We can use an inherited attribute for
2271       // that.
2272       return;
2273     }
2274   }
2275   for (Loop *InnerL : L)
2276     collectSupportedLoops(*InnerL, LI, ORE, V);
2277 }
2278 
2279 //===----------------------------------------------------------------------===//
2280 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2281 // LoopVectorizationCostModel and LoopVectorizationPlanner.
2282 //===----------------------------------------------------------------------===//
2283 
2284 /// Compute the transformed value of Index at offset StartValue using step
2285 /// StepValue.
2286 /// For integer induction, returns StartValue + Index * StepValue.
2287 /// For pointer induction, returns StartValue[Index * StepValue].
2288 /// FIXME: The newly created binary instructions should contain nsw/nuw
2289 /// flags, which can be found from the original scalar operations.
2290 static Value *
2291 emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue,
2292                      Value *Step,
2293                      InductionDescriptor::InductionKind InductionKind,
2294                      const BinaryOperator *InductionBinOp) {
2295   Type *StepTy = Step->getType();
2296   Value *CastedIndex = StepTy->isIntegerTy()
2297                            ? B.CreateSExtOrTrunc(Index, StepTy)
2298                            : B.CreateCast(Instruction::SIToFP, Index, StepTy);
2299   if (CastedIndex != Index) {
2300     CastedIndex->setName(CastedIndex->getName() + ".cast");
2301     Index = CastedIndex;
2302   }
2303 
2304   // Note: the IR at this point is broken. We cannot use SE to create any new
2305   // SCEV and then expand it, hoping that SCEV's simplification will give us
2306   // a more optimal code. Unfortunately, attempt of doing so on invalid IR may
2307   // lead to various SCEV crashes. So all we can do is to use builder and rely
2308   // on InstCombine for future simplifications. Here we handle some trivial
2309   // cases only.
2310   auto CreateAdd = [&B](Value *X, Value *Y) {
2311     assert(X->getType() == Y->getType() && "Types don't match!");
2312     if (auto *CX = dyn_cast<ConstantInt>(X))
2313       if (CX->isZero())
2314         return Y;
2315     if (auto *CY = dyn_cast<ConstantInt>(Y))
2316       if (CY->isZero())
2317         return X;
2318     return B.CreateAdd(X, Y);
2319   };
2320 
2321   // We allow X to be a vector type, in which case Y will potentially be
2322   // splatted into a vector with the same element count.
2323   auto CreateMul = [&B](Value *X, Value *Y) {
2324     assert(X->getType()->getScalarType() == Y->getType() &&
2325            "Types don't match!");
2326     if (auto *CX = dyn_cast<ConstantInt>(X))
2327       if (CX->isOne())
2328         return Y;
2329     if (auto *CY = dyn_cast<ConstantInt>(Y))
2330       if (CY->isOne())
2331         return X;
2332     VectorType *XVTy = dyn_cast<VectorType>(X->getType());
2333     if (XVTy && !isa<VectorType>(Y->getType()))
2334       Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
2335     return B.CreateMul(X, Y);
2336   };
2337 
2338   switch (InductionKind) {
2339   case InductionDescriptor::IK_IntInduction: {
2340     assert(!isa<VectorType>(Index->getType()) &&
2341            "Vector indices not supported for integer inductions yet");
2342     assert(Index->getType() == StartValue->getType() &&
2343            "Index type does not match StartValue type");
2344     if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne())
2345       return B.CreateSub(StartValue, Index);
2346     auto *Offset = CreateMul(Index, Step);
2347     return CreateAdd(StartValue, Offset);
2348   }
2349   case InductionDescriptor::IK_PtrInduction:
2350     return B.CreatePtrAdd(StartValue, CreateMul(Index, Step));
2351   case InductionDescriptor::IK_FpInduction: {
2352     assert(!isa<VectorType>(Index->getType()) &&
2353            "Vector indices not supported for FP inductions yet");
2354     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2355     assert(InductionBinOp &&
2356            (InductionBinOp->getOpcode() == Instruction::FAdd ||
2357             InductionBinOp->getOpcode() == Instruction::FSub) &&
2358            "Original bin op should be defined for FP induction");
2359 
2360     Value *MulExp = B.CreateFMul(Step, Index);
2361     return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2362                          "induction");
2363   }
2364   case InductionDescriptor::IK_NoInduction:
2365     return nullptr;
2366   }
2367   llvm_unreachable("invalid enum");
2368 }
2369 
2370 std::optional<unsigned> getMaxVScale(const Function &F,
2371                                      const TargetTransformInfo &TTI) {
2372   if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale())
2373     return MaxVScale;
2374 
2375   if (F.hasFnAttribute(Attribute::VScaleRange))
2376     return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
2377 
2378   return std::nullopt;
2379 }
2380 
2381 /// For the given VF and UF and maximum trip count computed for the loop, return
2382 /// whether the induction variable might overflow in the vectorized loop. If not,
2383 /// then we know a runtime overflow check always evaluates to false and can be
2384 /// removed.
2385 static bool isIndvarOverflowCheckKnownFalse(
2386     const LoopVectorizationCostModel *Cost,
2387     ElementCount VF, std::optional<unsigned> UF = std::nullopt) {
2388   // Always be conservative if we don't know the exact unroll factor.
2389   unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF);
2390 
2391   Type *IdxTy = Cost->Legal->getWidestInductionType();
2392   APInt MaxUIntTripCount = cast<IntegerType>(IdxTy)->getMask();
2393 
2394   // We know the runtime overflow check is known false iff the (max) trip-count
2395   // is known and (max) trip-count + (VF * UF) does not overflow in the type of
2396   // the vector loop induction variable.
2397   if (unsigned TC =
2398           Cost->PSE.getSE()->getSmallConstantMaxTripCount(Cost->TheLoop)) {
2399     uint64_t MaxVF = VF.getKnownMinValue();
2400     if (VF.isScalable()) {
2401       std::optional<unsigned> MaxVScale =
2402           getMaxVScale(*Cost->TheFunction, Cost->TTI);
2403       if (!MaxVScale)
2404         return false;
2405       MaxVF *= *MaxVScale;
2406     }
2407 
2408     return (MaxUIntTripCount - TC).ugt(MaxVF * MaxUF);
2409   }
2410 
2411   return false;
2412 }
2413 
2414 // Return whether we allow using masked interleave-groups (for dealing with
2415 // strided loads/stores that reside in predicated blocks, or for dealing
2416 // with gaps).
2417 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2418   // If an override option has been passed in for interleaved accesses, use it.
2419   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2420     return EnableMaskedInterleavedMemAccesses;
2421 
2422   return TTI.enableMaskedInterleavedAccessVectorization();
2423 }
2424 
2425 // Try to vectorize the interleave group that \p Instr belongs to.
2426 //
2427 // E.g. Translate following interleaved load group (factor = 3):
2428 //   for (i = 0; i < N; i+=3) {
2429 //     R = Pic[i];             // Member of index 0
2430 //     G = Pic[i+1];           // Member of index 1
2431 //     B = Pic[i+2];           // Member of index 2
2432 //     ... // do something to R, G, B
2433 //   }
2434 // To:
2435 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2436 //   %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9>   ; R elements
2437 //   %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10>  ; G elements
2438 //   %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11>  ; B elements
2439 //
2440 // Or translate following interleaved store group (factor = 3):
2441 //   for (i = 0; i < N; i+=3) {
2442 //     ... do something to R, G, B
2443 //     Pic[i]   = R;           // Member of index 0
2444 //     Pic[i+1] = G;           // Member of index 1
2445 //     Pic[i+2] = B;           // Member of index 2
2446 //   }
2447 // To:
2448 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2449 //   %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
2450 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2451 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2452 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2453 void InnerLoopVectorizer::vectorizeInterleaveGroup(
2454     const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
2455     VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
2456     VPValue *BlockInMask, bool NeedsMaskForGaps) {
2457   Instruction *Instr = Group->getInsertPos();
2458   const DataLayout &DL = Instr->getModule()->getDataLayout();
2459 
2460   // Prepare for the vector type of the interleaved load/store.
2461   Type *ScalarTy = getLoadStoreType(Instr);
2462   unsigned InterleaveFactor = Group->getFactor();
2463   auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2464 
2465   // Prepare for the new pointers.
2466   SmallVector<Value *, 2> AddrParts;
2467   unsigned Index = Group->getIndex(Instr);
2468 
2469   // TODO: extend the masked interleaved-group support to reversed access.
2470   assert((!BlockInMask || !Group->isReverse()) &&
2471          "Reversed masked interleave-group not supported.");
2472 
2473   Value *Idx;
2474   // If the group is reverse, adjust the index to refer to the last vector lane
2475   // instead of the first. We adjust the index from the first vector lane,
2476   // rather than directly getting the pointer for lane VF - 1, because the
2477   // pointer operand of the interleaved access is supposed to be uniform. For
2478   // uniform instructions, we're only required to generate a value for the
2479   // first vector lane in each unroll iteration.
2480   if (Group->isReverse()) {
2481     Value *RuntimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF);
2482     Idx = Builder.CreateSub(RuntimeVF, Builder.getInt32(1));
2483     Idx = Builder.CreateMul(Idx, Builder.getInt32(Group->getFactor()));
2484     Idx = Builder.CreateAdd(Idx, Builder.getInt32(Index));
2485     Idx = Builder.CreateNeg(Idx);
2486   } else
2487     Idx = Builder.getInt32(-Index);
2488 
2489   for (unsigned Part = 0; Part < UF; Part++) {
2490     Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
2491     if (auto *I = dyn_cast<Instruction>(AddrPart))
2492       State.setDebugLocFrom(I->getDebugLoc());
2493 
2494     // Notice current instruction could be any index. Need to adjust the address
2495     // to the member of index 0.
2496     //
2497     // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
2498     //       b = A[i];       // Member of index 0
2499     // Current pointer is pointed to A[i+1], adjust it to A[i].
2500     //
2501     // E.g.  A[i+1] = a;     // Member of index 1
2502     //       A[i]   = b;     // Member of index 0
2503     //       A[i+2] = c;     // Member of index 2 (Current instruction)
2504     // Current pointer is pointed to A[i+2], adjust it to A[i].
2505 
2506     bool InBounds = false;
2507     if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2508       InBounds = gep->isInBounds();
2509     AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Idx, "", InBounds);
2510     AddrParts.push_back(AddrPart);
2511   }
2512 
2513   State.setDebugLocFrom(Instr->getDebugLoc());
2514   Value *PoisonVec = PoisonValue::get(VecTy);
2515 
2516   auto CreateGroupMask = [this, &BlockInMask, &State, &InterleaveFactor](
2517                              unsigned Part, Value *MaskForGaps) -> Value * {
2518     if (VF.isScalable()) {
2519       assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
2520       assert(InterleaveFactor == 2 &&
2521              "Unsupported deinterleave factor for scalable vectors");
2522       auto *BlockInMaskPart = State.get(BlockInMask, Part);
2523       SmallVector<Value *, 2> Ops = {BlockInMaskPart, BlockInMaskPart};
2524       auto *MaskTy =
2525           VectorType::get(Builder.getInt1Ty(), VF.getKnownMinValue() * 2, true);
2526       return Builder.CreateIntrinsic(
2527           MaskTy, Intrinsic::experimental_vector_interleave2, Ops,
2528           /*FMFSource=*/nullptr, "interleaved.mask");
2529     }
2530 
2531     if (!BlockInMask)
2532       return MaskForGaps;
2533 
2534     Value *BlockInMaskPart = State.get(BlockInMask, Part);
2535     Value *ShuffledMask = Builder.CreateShuffleVector(
2536         BlockInMaskPart,
2537         createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2538         "interleaved.mask");
2539     return MaskForGaps ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2540                                              MaskForGaps)
2541                        : ShuffledMask;
2542   };
2543 
2544   // Vectorize the interleaved load group.
2545   if (isa<LoadInst>(Instr)) {
2546     Value *MaskForGaps = nullptr;
2547     if (NeedsMaskForGaps) {
2548       MaskForGaps =
2549           createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2550       assert(MaskForGaps && "Mask for Gaps is required but it is null");
2551     }
2552 
2553     // For each unroll part, create a wide load for the group.
2554     SmallVector<Value *, 2> NewLoads;
2555     for (unsigned Part = 0; Part < UF; Part++) {
2556       Instruction *NewLoad;
2557       if (BlockInMask || MaskForGaps) {
2558         assert(useMaskedInterleavedAccesses(*TTI) &&
2559                "masked interleaved groups are not allowed.");
2560         Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
2561         NewLoad =
2562             Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(),
2563                                      GroupMask, PoisonVec, "wide.masked.vec");
2564       }
2565       else
2566         NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2567                                             Group->getAlign(), "wide.vec");
2568       Group->addMetadata(NewLoad);
2569       NewLoads.push_back(NewLoad);
2570     }
2571 
2572     if (VecTy->isScalableTy()) {
2573       assert(InterleaveFactor == 2 &&
2574              "Unsupported deinterleave factor for scalable vectors");
2575 
2576       for (unsigned Part = 0; Part < UF; ++Part) {
2577         // Scalable vectors cannot use arbitrary shufflevectors (only splats),
2578         // so must use intrinsics to deinterleave.
2579         Value *DI = Builder.CreateIntrinsic(
2580             Intrinsic::experimental_vector_deinterleave2, VecTy, NewLoads[Part],
2581             /*FMFSource=*/nullptr, "strided.vec");
2582         unsigned J = 0;
2583         for (unsigned I = 0; I < InterleaveFactor; ++I) {
2584           Instruction *Member = Group->getMember(I);
2585 
2586           if (!Member)
2587             continue;
2588 
2589           Value *StridedVec = Builder.CreateExtractValue(DI, I);
2590           // If this member has different type, cast the result type.
2591           if (Member->getType() != ScalarTy) {
2592             VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2593             StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2594           }
2595 
2596           if (Group->isReverse())
2597             StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
2598 
2599           State.set(VPDefs[J], StridedVec, Part);
2600           ++J;
2601         }
2602       }
2603 
2604       return;
2605     }
2606 
2607     // For each member in the group, shuffle out the appropriate data from the
2608     // wide loads.
2609     unsigned J = 0;
2610     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2611       Instruction *Member = Group->getMember(I);
2612 
2613       // Skip the gaps in the group.
2614       if (!Member)
2615         continue;
2616 
2617       auto StrideMask =
2618           createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
2619       for (unsigned Part = 0; Part < UF; Part++) {
2620         Value *StridedVec = Builder.CreateShuffleVector(
2621             NewLoads[Part], StrideMask, "strided.vec");
2622 
2623         // If this member has different type, cast the result type.
2624         if (Member->getType() != ScalarTy) {
2625           assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2626           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2627           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2628         }
2629 
2630         if (Group->isReverse())
2631           StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
2632 
2633         State.set(VPDefs[J], StridedVec, Part);
2634       }
2635       ++J;
2636     }
2637     return;
2638   }
2639 
2640   // The sub vector type for current instruction.
2641   auto *SubVT = VectorType::get(ScalarTy, VF);
2642 
2643   // Vectorize the interleaved store group.
2644   Value *MaskForGaps =
2645       createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2646   assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) &&
2647          "masked interleaved groups are not allowed.");
2648   assert((!MaskForGaps || !VF.isScalable()) &&
2649          "masking gaps for scalable vectors is not yet supported.");
2650   for (unsigned Part = 0; Part < UF; Part++) {
2651     // Collect the stored vector from each member.
2652     SmallVector<Value *, 4> StoredVecs;
2653     unsigned StoredIdx = 0;
2654     for (unsigned i = 0; i < InterleaveFactor; i++) {
2655       assert((Group->getMember(i) || MaskForGaps) &&
2656              "Fail to get a member from an interleaved store group");
2657       Instruction *Member = Group->getMember(i);
2658 
2659       // Skip the gaps in the group.
2660       if (!Member) {
2661         Value *Undef = PoisonValue::get(SubVT);
2662         StoredVecs.push_back(Undef);
2663         continue;
2664       }
2665 
2666       Value *StoredVec = State.get(StoredValues[StoredIdx], Part);
2667       ++StoredIdx;
2668 
2669       if (Group->isReverse())
2670         StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse");
2671 
2672       // If this member has different type, cast it to a unified type.
2673 
2674       if (StoredVec->getType() != SubVT)
2675         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2676 
2677       StoredVecs.push_back(StoredVec);
2678     }
2679 
2680     // Interleave all the smaller vectors into one wider vector.
2681     Value *IVec = interleaveVectors(Builder, StoredVecs, "interleaved.vec");
2682     Instruction *NewStoreInstr;
2683     if (BlockInMask || MaskForGaps) {
2684       Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
2685       NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part],
2686                                                 Group->getAlign(), GroupMask);
2687     } else
2688       NewStoreInstr =
2689           Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2690 
2691     Group->addMetadata(NewStoreInstr);
2692   }
2693 }
2694 
2695 void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr,
2696                                                VPReplicateRecipe *RepRecipe,
2697                                                const VPIteration &Instance,
2698                                                VPTransformState &State) {
2699   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2700 
2701   // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
2702   // the first lane and part.
2703   if (isa<NoAliasScopeDeclInst>(Instr))
2704     if (!Instance.isFirstIteration())
2705       return;
2706 
2707   // Does this instruction return a value ?
2708   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2709 
2710   Instruction *Cloned = Instr->clone();
2711   if (!IsVoidRetTy) {
2712     Cloned->setName(Instr->getName() + ".cloned");
2713 #if !defined(NDEBUG)
2714     // Verify that VPlan type inference results agree with the type of the
2715     // generated values.
2716     assert(State.TypeAnalysis.inferScalarType(RepRecipe) == Cloned->getType() &&
2717            "inferred type and type from generated instructions do not match");
2718 #endif
2719   }
2720 
2721   RepRecipe->setFlags(Cloned);
2722 
2723   if (auto DL = Instr->getDebugLoc())
2724     State.setDebugLocFrom(DL);
2725 
2726   // Replace the operands of the cloned instructions with their scalar
2727   // equivalents in the new loop.
2728   for (const auto &I : enumerate(RepRecipe->operands())) {
2729     auto InputInstance = Instance;
2730     VPValue *Operand = I.value();
2731     if (vputils::isUniformAfterVectorization(Operand))
2732       InputInstance.Lane = VPLane::getFirstLane();
2733     Cloned->setOperand(I.index(), State.get(Operand, InputInstance));
2734   }
2735   State.addNewMetadata(Cloned, Instr);
2736 
2737   // Place the cloned scalar in the new loop.
2738   State.Builder.Insert(Cloned);
2739 
2740   State.set(RepRecipe, Cloned, Instance);
2741 
2742   // If we just cloned a new assumption, add it the assumption cache.
2743   if (auto *II = dyn_cast<AssumeInst>(Cloned))
2744     AC->registerAssumption(II);
2745 
2746   // End if-block.
2747   bool IfPredicateInstr = RepRecipe->getParent()->getParent()->isReplicator();
2748   if (IfPredicateInstr)
2749     PredicatedInstructions.push_back(Cloned);
2750 }
2751 
2752 Value *
2753 InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
2754   if (VectorTripCount)
2755     return VectorTripCount;
2756 
2757   Value *TC = getTripCount();
2758   IRBuilder<> Builder(InsertBlock->getTerminator());
2759 
2760   Type *Ty = TC->getType();
2761   // This is where we can make the step a runtime constant.
2762   Value *Step = createStepForVF(Builder, Ty, VF, UF);
2763 
2764   // If the tail is to be folded by masking, round the number of iterations N
2765   // up to a multiple of Step instead of rounding down. This is done by first
2766   // adding Step-1 and then rounding down. Note that it's ok if this addition
2767   // overflows: the vector induction variable will eventually wrap to zero given
2768   // that it starts at zero and its Step is a power of two; the loop will then
2769   // exit, with the last early-exit vector comparison also producing all-true.
2770   // For scalable vectors the VF is not guaranteed to be a power of 2, but this
2771   // is accounted for in emitIterationCountCheck that adds an overflow check.
2772   if (Cost->foldTailByMasking()) {
2773     assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
2774            "VF*UF must be a power of 2 when folding tail by masking");
2775     Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF);
2776     TC = Builder.CreateAdd(
2777         TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up");
2778   }
2779 
2780   // Now we need to generate the expression for the part of the loop that the
2781   // vectorized body will execute. This is equal to N - (N % Step) if scalar
2782   // iterations are not required for correctness, or N - Step, otherwise. Step
2783   // is equal to the vectorization factor (number of SIMD elements) times the
2784   // unroll factor (number of SIMD instructions).
2785   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2786 
2787   // There are cases where we *must* run at least one iteration in the remainder
2788   // loop.  See the cost model for when this can happen.  If the step evenly
2789   // divides the trip count, we set the remainder to be equal to the step. If
2790   // the step does not evenly divide the trip count, no adjustment is necessary
2791   // since there will already be scalar iterations. Note that the minimum
2792   // iterations check ensures that N >= Step.
2793   if (Cost->requiresScalarEpilogue(VF.isVector())) {
2794     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2795     R = Builder.CreateSelect(IsZero, Step, R);
2796   }
2797 
2798   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2799 
2800   return VectorTripCount;
2801 }
2802 
2803 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2804                                                    const DataLayout &DL) {
2805   // Verify that V is a vector type with same number of elements as DstVTy.
2806   auto *DstFVTy = cast<VectorType>(DstVTy);
2807   auto VF = DstFVTy->getElementCount();
2808   auto *SrcVecTy = cast<VectorType>(V->getType());
2809   assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match");
2810   Type *SrcElemTy = SrcVecTy->getElementType();
2811   Type *DstElemTy = DstFVTy->getElementType();
2812   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2813          "Vector elements must have same size");
2814 
2815   // Do a direct cast if element types are castable.
2816   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2817     return Builder.CreateBitOrPointerCast(V, DstFVTy);
2818   }
2819   // V cannot be directly casted to desired vector type.
2820   // May happen when V is a floating point vector but DstVTy is a vector of
2821   // pointers or vice-versa. Handle this using a two-step bitcast using an
2822   // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
2823   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2824          "Only one type should be a pointer type");
2825   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2826          "Only one type should be a floating point type");
2827   Type *IntTy =
2828       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2829   auto *VecIntTy = VectorType::get(IntTy, VF);
2830   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2831   return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
2832 }
2833 
2834 void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
2835   Value *Count = getTripCount();
2836   // Reuse existing vector loop preheader for TC checks.
2837   // Note that new preheader block is generated for vector loop.
2838   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2839   IRBuilder<> Builder(TCCheckBlock->getTerminator());
2840 
2841   // Generate code to check if the loop's trip count is less than VF * UF, or
2842   // equal to it in case a scalar epilogue is required; this implies that the
2843   // vector trip count is zero. This check also covers the case where adding one
2844   // to the backedge-taken count overflowed leading to an incorrect trip count
2845   // of zero. In this case we will also jump to the scalar loop.
2846   auto P = Cost->requiresScalarEpilogue(VF.isVector()) ? ICmpInst::ICMP_ULE
2847                                                        : ICmpInst::ICMP_ULT;
2848 
2849   // If tail is to be folded, vector loop takes care of all iterations.
2850   Type *CountTy = Count->getType();
2851   Value *CheckMinIters = Builder.getFalse();
2852   auto CreateStep = [&]() -> Value * {
2853     // Create step with max(MinProTripCount, UF * VF).
2854     if (UF * VF.getKnownMinValue() >= MinProfitableTripCount.getKnownMinValue())
2855       return createStepForVF(Builder, CountTy, VF, UF);
2856 
2857     Value *MinProfTC =
2858         createStepForVF(Builder, CountTy, MinProfitableTripCount, 1);
2859     if (!VF.isScalable())
2860       return MinProfTC;
2861     return Builder.CreateBinaryIntrinsic(
2862         Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF));
2863   };
2864 
2865   TailFoldingStyle Style = Cost->getTailFoldingStyle();
2866   if (Style == TailFoldingStyle::None)
2867     CheckMinIters =
2868         Builder.CreateICmp(P, Count, CreateStep(), "min.iters.check");
2869   else if (VF.isScalable() &&
2870            !isIndvarOverflowCheckKnownFalse(Cost, VF, UF) &&
2871            Style != TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) {
2872     // vscale is not necessarily a power-of-2, which means we cannot guarantee
2873     // an overflow to zero when updating induction variables and so an
2874     // additional overflow check is required before entering the vector loop.
2875 
2876     // Get the maximum unsigned value for the type.
2877     Value *MaxUIntTripCount =
2878         ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask());
2879     Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count);
2880 
2881     // Don't execute the vector loop if (UMax - n) < (VF * UF).
2882     CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep());
2883   }
2884 
2885   // Create new preheader for vector loop.
2886   LoopVectorPreHeader =
2887       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
2888                  "vector.ph");
2889 
2890   assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
2891                                DT->getNode(Bypass)->getIDom()) &&
2892          "TC check is expected to dominate Bypass");
2893 
2894   // Update dominator for Bypass & LoopExit (if needed).
2895   DT->changeImmediateDominator(Bypass, TCCheckBlock);
2896   if (!Cost->requiresScalarEpilogue(VF.isVector()))
2897     // If there is an epilogue which must run, there's no edge from the
2898     // middle block to exit blocks  and thus no need to update the immediate
2899     // dominator of the exit blocks.
2900     DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
2901 
2902   BranchInst &BI =
2903       *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
2904   if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
2905     setBranchWeights(BI, MinItersBypassWeights);
2906   ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
2907   LoopBypassBlocks.push_back(TCCheckBlock);
2908 }
2909 
2910 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {
2911   BasicBlock *const SCEVCheckBlock =
2912       RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock);
2913   if (!SCEVCheckBlock)
2914     return nullptr;
2915 
2916   assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
2917            (OptForSizeBasedOnProfile &&
2918             Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
2919          "Cannot SCEV check stride or overflow when optimizing for size");
2920 
2921 
2922   // Update dominator only if this is first RT check.
2923   if (LoopBypassBlocks.empty()) {
2924     DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
2925     if (!Cost->requiresScalarEpilogue(VF.isVector()))
2926       // If there is an epilogue which must run, there's no edge from the
2927       // middle block to exit blocks  and thus no need to update the immediate
2928       // dominator of the exit blocks.
2929       DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
2930   }
2931 
2932   LoopBypassBlocks.push_back(SCEVCheckBlock);
2933   AddedSafetyChecks = true;
2934   return SCEVCheckBlock;
2935 }
2936 
2937 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) {
2938   // VPlan-native path does not do any analysis for runtime checks currently.
2939   if (EnableVPlanNativePath)
2940     return nullptr;
2941 
2942   BasicBlock *const MemCheckBlock =
2943       RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader);
2944 
2945   // Check if we generated code that checks in runtime if arrays overlap. We put
2946   // the checks into a separate block to make the more common case of few
2947   // elements faster.
2948   if (!MemCheckBlock)
2949     return nullptr;
2950 
2951   if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
2952     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
2953            "Cannot emit memory checks when optimizing for size, unless forced "
2954            "to vectorize.");
2955     ORE->emit([&]() {
2956       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
2957                                         OrigLoop->getStartLoc(),
2958                                         OrigLoop->getHeader())
2959              << "Code-size may be reduced by not forcing "
2960                 "vectorization, or by source-code modifications "
2961                 "eliminating the need for runtime checks "
2962                 "(e.g., adding 'restrict').";
2963     });
2964   }
2965 
2966   LoopBypassBlocks.push_back(MemCheckBlock);
2967 
2968   AddedSafetyChecks = true;
2969 
2970   return MemCheckBlock;
2971 }
2972 
2973 void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
2974   LoopScalarBody = OrigLoop->getHeader();
2975   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
2976   assert(LoopVectorPreHeader && "Invalid loop structure");
2977   LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
2978   assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF.isVector())) &&
2979          "multiple exit loop without required epilogue?");
2980 
2981   LoopMiddleBlock =
2982       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
2983                  LI, nullptr, Twine(Prefix) + "middle.block");
2984   LoopScalarPreHeader =
2985       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
2986                  nullptr, Twine(Prefix) + "scalar.ph");
2987 
2988   auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
2989 
2990   // Set up the middle block terminator.  Two cases:
2991   // 1) If we know that we must execute the scalar epilogue, emit an
2992   //    unconditional branch.
2993   // 2) Otherwise, we must have a single unique exit block (due to how we
2994   //    implement the multiple exit case).  In this case, set up a conditional
2995   //    branch from the middle block to the loop scalar preheader, and the
2996   //    exit block.  completeLoopSkeleton will update the condition to use an
2997   //    iteration check, if required to decide whether to execute the remainder.
2998   BranchInst *BrInst =
2999       Cost->requiresScalarEpilogue(VF.isVector())
3000           ? BranchInst::Create(LoopScalarPreHeader)
3001           : BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
3002                                Builder.getTrue());
3003   BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3004   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3005 
3006   // Update dominator for loop exit. During skeleton creation, only the vector
3007   // pre-header and the middle block are created. The vector loop is entirely
3008   // created during VPlan exection.
3009   if (!Cost->requiresScalarEpilogue(VF.isVector()))
3010     // If there is an epilogue which must run, there's no edge from the
3011     // middle block to exit blocks  and thus no need to update the immediate
3012     // dominator of the exit blocks.
3013     DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3014 }
3015 
3016 PHINode *InnerLoopVectorizer::createInductionResumeValue(
3017     PHINode *OrigPhi, const InductionDescriptor &II, Value *Step,
3018     ArrayRef<BasicBlock *> BypassBlocks,
3019     std::pair<BasicBlock *, Value *> AdditionalBypass) {
3020   Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
3021   assert(VectorTripCount && "Expected valid arguments");
3022 
3023   Instruction *OldInduction = Legal->getPrimaryInduction();
3024   Value *&EndValue = IVEndValues[OrigPhi];
3025   Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
3026   if (OrigPhi == OldInduction) {
3027     // We know what the end value is.
3028     EndValue = VectorTripCount;
3029   } else {
3030     IRBuilder<> B(LoopVectorPreHeader->getTerminator());
3031 
3032     // Fast-math-flags propagate from the original induction instruction.
3033     if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3034       B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3035 
3036     EndValue = emitTransformedIndex(B, VectorTripCount, II.getStartValue(),
3037                                     Step, II.getKind(), II.getInductionBinOp());
3038     EndValue->setName("ind.end");
3039 
3040     // Compute the end value for the additional bypass (if applicable).
3041     if (AdditionalBypass.first) {
3042       B.SetInsertPoint(AdditionalBypass.first,
3043                        AdditionalBypass.first->getFirstInsertionPt());
3044       EndValueFromAdditionalBypass =
3045           emitTransformedIndex(B, AdditionalBypass.second, II.getStartValue(),
3046                                Step, II.getKind(), II.getInductionBinOp());
3047       EndValueFromAdditionalBypass->setName("ind.end");
3048     }
3049   }
3050 
3051   // Create phi nodes to merge from the  backedge-taken check block.
3052   PHINode *BCResumeVal = PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3053                                          LoopScalarPreHeader->getTerminator());
3054   // Copy original phi DL over to the new one.
3055   BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3056 
3057   // The new PHI merges the original incoming value, in case of a bypass,
3058   // or the value at the end of the vectorized loop.
3059   BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3060 
3061   // Fix the scalar body counter (PHI node).
3062   // The old induction's phi node in the scalar body needs the truncated
3063   // value.
3064   for (BasicBlock *BB : BypassBlocks)
3065     BCResumeVal->addIncoming(II.getStartValue(), BB);
3066 
3067   if (AdditionalBypass.first)
3068     BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
3069                                           EndValueFromAdditionalBypass);
3070   return BCResumeVal;
3071 }
3072 
3073 /// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
3074 /// expansion results.
3075 static Value *getExpandedStep(const InductionDescriptor &ID,
3076                               const SCEV2ValueTy &ExpandedSCEVs) {
3077   const SCEV *Step = ID.getStep();
3078   if (auto *C = dyn_cast<SCEVConstant>(Step))
3079     return C->getValue();
3080   if (auto *U = dyn_cast<SCEVUnknown>(Step))
3081     return U->getValue();
3082   auto I = ExpandedSCEVs.find(Step);
3083   assert(I != ExpandedSCEVs.end() && "SCEV must be expanded at this point");
3084   return I->second;
3085 }
3086 
3087 void InnerLoopVectorizer::createInductionResumeValues(
3088     const SCEV2ValueTy &ExpandedSCEVs,
3089     std::pair<BasicBlock *, Value *> AdditionalBypass) {
3090   assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3091           (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3092          "Inconsistent information about additional bypass.");
3093   // We are going to resume the execution of the scalar loop.
3094   // Go over all of the induction variables that we found and fix the
3095   // PHIs that are left in the scalar version of the loop.
3096   // The starting values of PHI nodes depend on the counter of the last
3097   // iteration in the vectorized loop.
3098   // If we come from a bypass edge then we need to start from the original
3099   // start value.
3100   for (const auto &InductionEntry : Legal->getInductionVars()) {
3101     PHINode *OrigPhi = InductionEntry.first;
3102     const InductionDescriptor &II = InductionEntry.second;
3103     PHINode *BCResumeVal = createInductionResumeValue(
3104         OrigPhi, II, getExpandedStep(II, ExpandedSCEVs), LoopBypassBlocks,
3105         AdditionalBypass);
3106     OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3107   }
3108 }
3109 
3110 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() {
3111   // The trip counts should be cached by now.
3112   Value *Count = getTripCount();
3113   Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
3114 
3115   auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3116 
3117   // Add a check in the middle block to see if we have completed
3118   // all of the iterations in the first vector loop.  Three cases:
3119   // 1) If we require a scalar epilogue, there is no conditional branch as
3120   //    we unconditionally branch to the scalar preheader.  Do nothing.
3121   // 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
3122   //    Thus if tail is to be folded, we know we don't need to run the
3123   //    remainder and we can use the previous value for the condition (true).
3124   // 3) Otherwise, construct a runtime check.
3125   if (!Cost->requiresScalarEpilogue(VF.isVector()) &&
3126       !Cost->foldTailByMasking()) {
3127     // Here we use the same DebugLoc as the scalar loop latch terminator instead
3128     // of the corresponding compare because they may have ended up with
3129     // different line numbers and we want to avoid awkward line stepping while
3130     // debugging. Eg. if the compare has got a line number inside the loop.
3131     // TODO: At the moment, CreateICmpEQ will simplify conditions with constant
3132     // operands. Perform simplification directly on VPlan once the branch is
3133     // modeled there.
3134     IRBuilder<> B(LoopMiddleBlock->getTerminator());
3135     B.SetCurrentDebugLocation(ScalarLatchTerm->getDebugLoc());
3136     Value *CmpN = B.CreateICmpEQ(Count, VectorTripCount, "cmp.n");
3137     BranchInst &BI = *cast<BranchInst>(LoopMiddleBlock->getTerminator());
3138     BI.setCondition(CmpN);
3139     if (hasBranchWeightMD(*ScalarLatchTerm)) {
3140       // Assume that `Count % VectorTripCount` is equally distributed.
3141       unsigned TripCount = UF * VF.getKnownMinValue();
3142       assert(TripCount > 0 && "trip count should not be zero");
3143       const uint32_t Weights[] = {1, TripCount - 1};
3144       setBranchWeights(BI, Weights);
3145     }
3146   }
3147 
3148 #ifdef EXPENSIVE_CHECKS
3149   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3150 #endif
3151 
3152   return LoopVectorPreHeader;
3153 }
3154 
3155 std::pair<BasicBlock *, Value *>
3156 InnerLoopVectorizer::createVectorizedLoopSkeleton(
3157     const SCEV2ValueTy &ExpandedSCEVs) {
3158   /*
3159    In this function we generate a new loop. The new loop will contain
3160    the vectorized instructions while the old loop will continue to run the
3161    scalar remainder.
3162 
3163        [ ] <-- old preheader - loop iteration number check and SCEVs in Plan's
3164      /  |      preheader are expanded here. Eventually all required SCEV
3165     /   |      expansion should happen here.
3166    /    v
3167   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
3168   |  /  |
3169   | /   v
3170   ||   [ ]     <-- vector pre header.
3171   |/    |
3172   |     v
3173   |    [  ] \
3174   |    [  ]_|   <-- vector loop (created during VPlan execution).
3175   |     |
3176   |     v
3177   \   -[ ]   <--- middle-block.
3178    \/   |
3179    /\   v
3180    | ->[ ]     <--- new preheader.
3181    |    |
3182  (opt)  v      <-- edge from middle to exit iff epilogue is not required.
3183    |   [ ] \
3184    |   [ ]_|   <-- old scalar loop to handle remainder (scalar epilogue).
3185     \   |
3186      \  v
3187       >[ ]     <-- exit block(s).
3188    ...
3189    */
3190 
3191   // Create an empty vector loop, and prepare basic blocks for the runtime
3192   // checks.
3193   createVectorLoopSkeleton("");
3194 
3195   // Now, compare the new count to zero. If it is zero skip the vector loop and
3196   // jump to the scalar loop. This check also covers the case where the
3197   // backedge-taken count is uint##_max: adding one to it will overflow leading
3198   // to an incorrect trip count of zero. In this (rare) case we will also jump
3199   // to the scalar loop.
3200   emitIterationCountCheck(LoopScalarPreHeader);
3201 
3202   // Generate the code to check any assumptions that we've made for SCEV
3203   // expressions.
3204   emitSCEVChecks(LoopScalarPreHeader);
3205 
3206   // Generate the code that checks in runtime if arrays overlap. We put the
3207   // checks into a separate block to make the more common case of few elements
3208   // faster.
3209   emitMemRuntimeChecks(LoopScalarPreHeader);
3210 
3211   // Emit phis for the new starting index of the scalar loop.
3212   createInductionResumeValues(ExpandedSCEVs);
3213 
3214   return {completeLoopSkeleton(), nullptr};
3215 }
3216 
3217 // Fix up external users of the induction variable. At this point, we are
3218 // in LCSSA form, with all external PHIs that use the IV having one input value,
3219 // coming from the remainder loop. We need those PHIs to also have a correct
3220 // value for the IV when arriving directly from the middle block.
3221 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3222                                        const InductionDescriptor &II,
3223                                        Value *VectorTripCount, Value *EndValue,
3224                                        BasicBlock *MiddleBlock,
3225                                        BasicBlock *VectorHeader, VPlan &Plan,
3226                                        VPTransformState &State) {
3227   // There are two kinds of external IV usages - those that use the value
3228   // computed in the last iteration (the PHI) and those that use the penultimate
3229   // value (the value that feeds into the phi from the loop latch).
3230   // We allow both, but they, obviously, have different values.
3231 
3232   assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
3233 
3234   DenseMap<Value *, Value *> MissingVals;
3235 
3236   // An external user of the last iteration's value should see the value that
3237   // the remainder loop uses to initialize its own IV.
3238   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3239   for (User *U : PostInc->users()) {
3240     Instruction *UI = cast<Instruction>(U);
3241     if (!OrigLoop->contains(UI)) {
3242       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3243       MissingVals[UI] = EndValue;
3244     }
3245   }
3246 
3247   // An external user of the penultimate value need to see EndValue - Step.
3248   // The simplest way to get this is to recompute it from the constituent SCEVs,
3249   // that is Start + (Step * (CRD - 1)).
3250   for (User *U : OrigPhi->users()) {
3251     auto *UI = cast<Instruction>(U);
3252     if (!OrigLoop->contains(UI)) {
3253       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3254       IRBuilder<> B(MiddleBlock->getTerminator());
3255 
3256       // Fast-math-flags propagate from the original induction instruction.
3257       if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3258         B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3259 
3260       Value *CountMinusOne = B.CreateSub(
3261           VectorTripCount, ConstantInt::get(VectorTripCount->getType(), 1));
3262       CountMinusOne->setName("cmo");
3263 
3264       VPValue *StepVPV = Plan.getSCEVExpansion(II.getStep());
3265       assert(StepVPV && "step must have been expanded during VPlan execution");
3266       Value *Step = StepVPV->isLiveIn() ? StepVPV->getLiveInIRValue()
3267                                         : State.get(StepVPV, {0, 0});
3268       Value *Escape =
3269           emitTransformedIndex(B, CountMinusOne, II.getStartValue(), Step,
3270                                II.getKind(), II.getInductionBinOp());
3271       Escape->setName("ind.escape");
3272       MissingVals[UI] = Escape;
3273     }
3274   }
3275 
3276   for (auto &I : MissingVals) {
3277     PHINode *PHI = cast<PHINode>(I.first);
3278     // One corner case we have to handle is two IVs "chasing" each-other,
3279     // that is %IV2 = phi [...], [ %IV1, %latch ]
3280     // In this case, if IV1 has an external use, we need to avoid adding both
3281     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3282     // don't already have an incoming value for the middle block.
3283     if (PHI->getBasicBlockIndex(MiddleBlock) == -1) {
3284       PHI->addIncoming(I.second, MiddleBlock);
3285       Plan.removeLiveOut(PHI);
3286     }
3287   }
3288 }
3289 
3290 namespace {
3291 
3292 struct CSEDenseMapInfo {
3293   static bool canHandle(const Instruction *I) {
3294     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3295            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3296   }
3297 
3298   static inline Instruction *getEmptyKey() {
3299     return DenseMapInfo<Instruction *>::getEmptyKey();
3300   }
3301 
3302   static inline Instruction *getTombstoneKey() {
3303     return DenseMapInfo<Instruction *>::getTombstoneKey();
3304   }
3305 
3306   static unsigned getHashValue(const Instruction *I) {
3307     assert(canHandle(I) && "Unknown instruction!");
3308     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3309                                                            I->value_op_end()));
3310   }
3311 
3312   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3313     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3314         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3315       return LHS == RHS;
3316     return LHS->isIdenticalTo(RHS);
3317   }
3318 };
3319 
3320 } // end anonymous namespace
3321 
3322 ///Perform cse of induction variable instructions.
3323 static void cse(BasicBlock *BB) {
3324   // Perform simple cse.
3325   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3326   for (Instruction &In : llvm::make_early_inc_range(*BB)) {
3327     if (!CSEDenseMapInfo::canHandle(&In))
3328       continue;
3329 
3330     // Check if we can replace this instruction with any of the
3331     // visited instructions.
3332     if (Instruction *V = CSEMap.lookup(&In)) {
3333       In.replaceAllUsesWith(V);
3334       In.eraseFromParent();
3335       continue;
3336     }
3337 
3338     CSEMap[&In] = &In;
3339   }
3340 }
3341 
3342 InstructionCost
3343 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
3344                                               ElementCount VF) const {
3345   // We only need to calculate a cost if the VF is scalar; for actual vectors
3346   // we should already have a pre-calculated cost at each VF.
3347   if (!VF.isScalar())
3348     return CallWideningDecisions.at(std::make_pair(CI, VF)).Cost;
3349 
3350   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
3351   Type *RetTy = CI->getType();
3352   if (RecurrenceDescriptor::isFMulAddIntrinsic(CI))
3353     if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind))
3354       return *RedCost;
3355 
3356   SmallVector<Type *, 4> Tys;
3357   for (auto &ArgOp : CI->args())
3358     Tys.push_back(ArgOp->getType());
3359 
3360   InstructionCost ScalarCallCost =
3361       TTI.getCallInstrCost(CI->getCalledFunction(), RetTy, Tys, CostKind);
3362 
3363   // If this is an intrinsic we may have a lower cost for it.
3364   if (getVectorIntrinsicIDForCall(CI, TLI)) {
3365     InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
3366     return std::min(ScalarCallCost, IntrinsicCost);
3367   }
3368   return ScalarCallCost;
3369 }
3370 
3371 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) {
3372   if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
3373     return Elt;
3374   return VectorType::get(Elt, VF);
3375 }
3376 
3377 InstructionCost
3378 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3379                                                    ElementCount VF) const {
3380   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3381   assert(ID && "Expected intrinsic call!");
3382   Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
3383   FastMathFlags FMF;
3384   if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3385     FMF = FPMO->getFastMathFlags();
3386 
3387   SmallVector<const Value *> Arguments(CI->args());
3388   FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
3389   SmallVector<Type *> ParamTys;
3390   std::transform(FTy->param_begin(), FTy->param_end(),
3391                  std::back_inserter(ParamTys),
3392                  [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });
3393 
3394   IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
3395                                     dyn_cast<IntrinsicInst>(CI));
3396   return TTI.getIntrinsicInstrCost(CostAttrs,
3397                                    TargetTransformInfo::TCK_RecipThroughput);
3398 }
3399 
3400 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3401   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3402   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3403   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3404 }
3405 
3406 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3407   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3408   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3409   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3410 }
3411 
3412 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
3413                                             VPlan &Plan) {
3414   // Fix widened non-induction PHIs by setting up the PHI operands.
3415   if (EnableVPlanNativePath)
3416     fixNonInductionPHIs(Plan, State);
3417 
3418   // At this point every instruction in the original loop is widened to a
3419   // vector form. Now we need to fix the recurrences in the loop. These PHI
3420   // nodes are currently empty because we did not want to introduce cycles.
3421   // This is the second stage of vectorizing recurrences. Note that fixing
3422   // reduction phis are already modeled in VPlan.
3423   // TODO: Also model fixing fixed-order recurrence phis in VPlan.
3424   VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion();
3425   VPBasicBlock *HeaderVPBB = VectorRegion->getEntryBasicBlock();
3426   for (VPRecipeBase &R : HeaderVPBB->phis()) {
3427     if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
3428       fixFixedOrderRecurrence(FOR, State);
3429   }
3430 
3431   // Forget the original basic block.
3432   PSE.getSE()->forgetLoop(OrigLoop);
3433   PSE.getSE()->forgetBlockAndLoopDispositions();
3434 
3435   // After vectorization, the exit blocks of the original loop will have
3436   // additional predecessors. Invalidate SCEVs for the exit phis in case SE
3437   // looked through single-entry phis.
3438   SmallVector<BasicBlock *> ExitBlocks;
3439   OrigLoop->getExitBlocks(ExitBlocks);
3440   for (BasicBlock *Exit : ExitBlocks)
3441     for (PHINode &PN : Exit->phis())
3442       PSE.getSE()->forgetLcssaPhiWithNewPredecessor(OrigLoop, &PN);
3443 
3444   VPBasicBlock *LatchVPBB = VectorRegion->getExitingBasicBlock();
3445   Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]);
3446   if (Cost->requiresScalarEpilogue(VF.isVector())) {
3447     // No edge from the middle block to the unique exit block has been inserted
3448     // and there is nothing to fix from vector loop; phis should have incoming
3449     // from scalar loop only.
3450   } else {
3451     // TODO: Check VPLiveOuts to see if IV users need fixing instead of checking
3452     // the cost model.
3453 
3454     // If we inserted an edge from the middle block to the unique exit block,
3455     // update uses outside the loop (phis) to account for the newly inserted
3456     // edge.
3457 
3458     // Fix-up external users of the induction variables.
3459     for (const auto &Entry : Legal->getInductionVars())
3460       fixupIVUsers(Entry.first, Entry.second,
3461                    getOrCreateVectorTripCount(VectorLoop->getLoopPreheader()),
3462                    IVEndValues[Entry.first], LoopMiddleBlock,
3463                    VectorLoop->getHeader(), Plan, State);
3464   }
3465 
3466   // Fix LCSSA phis not already fixed earlier. Extracts may need to be generated
3467   // in the exit block, so update the builder.
3468   State.Builder.SetInsertPoint(State.CFG.ExitBB,
3469                                State.CFG.ExitBB->getFirstNonPHIIt());
3470   for (const auto &KV : Plan.getLiveOuts())
3471     KV.second->fixPhi(Plan, State);
3472 
3473   for (Instruction *PI : PredicatedInstructions)
3474     sinkScalarOperands(&*PI);
3475 
3476   // Remove redundant induction instructions.
3477   cse(VectorLoop->getHeader());
3478 
3479   // Set/update profile weights for the vector and remainder loops as original
3480   // loop iterations are now distributed among them. Note that original loop
3481   // represented by LoopScalarBody becomes remainder loop after vectorization.
3482   //
3483   // For cases like foldTailByMasking() and requiresScalarEpiloque() we may
3484   // end up getting slightly roughened result but that should be OK since
3485   // profile is not inherently precise anyway. Note also possible bypass of
3486   // vector code caused by legality checks is ignored, assigning all the weight
3487   // to the vector loop, optimistically.
3488   //
3489   // For scalable vectorization we can't know at compile time how many iterations
3490   // of the loop are handled in one vector iteration, so instead assume a pessimistic
3491   // vscale of '1'.
3492   setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody), VectorLoop,
3493                                LI->getLoopFor(LoopScalarBody),
3494                                VF.getKnownMinValue() * UF);
3495 }
3496 
3497 void InnerLoopVectorizer::fixFixedOrderRecurrence(
3498     VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) {
3499   // This is the second phase of vectorizing first-order recurrences. An
3500   // overview of the transformation is described below. Suppose we have the
3501   // following loop.
3502   //
3503   //   for (int i = 0; i < n; ++i)
3504   //     b[i] = a[i] - a[i - 1];
3505   //
3506   // There is a first-order recurrence on "a". For this loop, the shorthand
3507   // scalar IR looks like:
3508   //
3509   //   scalar.ph:
3510   //     s_init = a[-1]
3511   //     br scalar.body
3512   //
3513   //   scalar.body:
3514   //     i = phi [0, scalar.ph], [i+1, scalar.body]
3515   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3516   //     s2 = a[i]
3517   //     b[i] = s2 - s1
3518   //     br cond, scalar.body, ...
3519   //
3520   // In this example, s1 is a recurrence because it's value depends on the
3521   // previous iteration. In the first phase of vectorization, we created a
3522   // vector phi v1 for s1. We now complete the vectorization and produce the
3523   // shorthand vector IR shown below (for VF = 4, UF = 1).
3524   //
3525   //   vector.ph:
3526   //     v_init = vector(..., ..., ..., a[-1])
3527   //     br vector.body
3528   //
3529   //   vector.body
3530   //     i = phi [0, vector.ph], [i+4, vector.body]
3531   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
3532   //     v2 = a[i, i+1, i+2, i+3];
3533   //     v3 = vector(v1(3), v2(0, 1, 2))
3534   //     b[i, i+1, i+2, i+3] = v2 - v3
3535   //     br cond, vector.body, middle.block
3536   //
3537   //   middle.block:
3538   //     x = v2(3)
3539   //     br scalar.ph
3540   //
3541   //   scalar.ph:
3542   //     s_init = phi [x, middle.block], [a[-1], otherwise]
3543   //     br scalar.body
3544   //
3545   // After execution completes the vector loop, we extract the next value of
3546   // the recurrence (x) to use as the initial value in the scalar loop.
3547 
3548   // Extract the last vector element in the middle block. This will be the
3549   // initial value for the recurrence when jumping to the scalar loop.
3550   VPValue *PreviousDef = PhiR->getBackedgeValue();
3551   Value *Incoming = State.get(PreviousDef, UF - 1);
3552   auto *ExtractForScalar = Incoming;
3553   auto *IdxTy = Builder.getInt32Ty();
3554   Value *RuntimeVF = nullptr;
3555   if (VF.isVector()) {
3556     auto *One = ConstantInt::get(IdxTy, 1);
3557     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3558     RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
3559     auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
3560     ExtractForScalar =
3561         Builder.CreateExtractElement(Incoming, LastIdx, "vector.recur.extract");
3562   }
3563 
3564   auto RecurSplice = cast<VPInstruction>(*PhiR->user_begin());
3565   assert(PhiR->getNumUsers() == 1 &&
3566          RecurSplice->getOpcode() ==
3567              VPInstruction::FirstOrderRecurrenceSplice &&
3568          "recurrence phi must have a single user: FirstOrderRecurrenceSplice");
3569   SmallVector<VPLiveOut *> LiveOuts;
3570   for (VPUser *U : RecurSplice->users())
3571     if (auto *LiveOut = dyn_cast<VPLiveOut>(U))
3572       LiveOuts.push_back(LiveOut);
3573 
3574   if (!LiveOuts.empty()) {
3575     // Extract the second last element in the middle block if the
3576     // Phi is used outside the loop. We need to extract the phi itself
3577     // and not the last element (the phi update in the current iteration). This
3578     // will be the value when jumping to the exit block from the
3579     // LoopMiddleBlock, when the scalar loop is not run at all.
3580     Value *ExtractForPhiUsedOutsideLoop = nullptr;
3581     if (VF.isVector()) {
3582       auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
3583       ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3584           Incoming, Idx, "vector.recur.extract.for.phi");
3585     } else {
3586       assert(UF > 1 && "VF and UF cannot both be 1");
3587       // When loop is unrolled without vectorizing, initialize
3588       // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled
3589       // value of `Incoming`. This is analogous to the vectorized case above:
3590       // extracting the second last element when VF > 1.
3591       ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
3592     }
3593 
3594     for (VPLiveOut *LiveOut : LiveOuts) {
3595       assert(!Cost->requiresScalarEpilogue(VF.isVector()));
3596       PHINode *LCSSAPhi = LiveOut->getPhi();
3597       LCSSAPhi->addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3598       State.Plan->removeLiveOut(LCSSAPhi);
3599     }
3600   }
3601 
3602   // Fix the initial value of the original recurrence in the scalar loop.
3603   Builder.SetInsertPoint(LoopScalarPreHeader, LoopScalarPreHeader->begin());
3604   PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
3605   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3606   auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue();
3607   for (auto *BB : predecessors(LoopScalarPreHeader)) {
3608     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3609     Start->addIncoming(Incoming, BB);
3610   }
3611 
3612   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3613   Phi->setName("scalar.recur");
3614 }
3615 
3616 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
3617   // The basic block and loop containing the predicated instruction.
3618   auto *PredBB = PredInst->getParent();
3619   auto *VectorLoop = LI->getLoopFor(PredBB);
3620 
3621   // Initialize a worklist with the operands of the predicated instruction.
3622   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3623 
3624   // Holds instructions that we need to analyze again. An instruction may be
3625   // reanalyzed if we don't yet know if we can sink it or not.
3626   SmallVector<Instruction *, 8> InstsToReanalyze;
3627 
3628   // Returns true if a given use occurs in the predicated block. Phi nodes use
3629   // their operands in their corresponding predecessor blocks.
3630   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
3631     auto *I = cast<Instruction>(U.getUser());
3632     BasicBlock *BB = I->getParent();
3633     if (auto *Phi = dyn_cast<PHINode>(I))
3634       BB = Phi->getIncomingBlock(
3635           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3636     return BB == PredBB;
3637   };
3638 
3639   // Iteratively sink the scalarized operands of the predicated instruction
3640   // into the block we created for it. When an instruction is sunk, it's
3641   // operands are then added to the worklist. The algorithm ends after one pass
3642   // through the worklist doesn't sink a single instruction.
3643   bool Changed;
3644   do {
3645     // Add the instructions that need to be reanalyzed to the worklist, and
3646     // reset the changed indicator.
3647     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
3648     InstsToReanalyze.clear();
3649     Changed = false;
3650 
3651     while (!Worklist.empty()) {
3652       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
3653 
3654       // We can't sink an instruction if it is a phi node, is not in the loop,
3655       // may have side effects or may read from memory.
3656       // TODO Could dor more granular checking to allow sinking a load past non-store instructions.
3657       if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
3658           I->mayHaveSideEffects() || I->mayReadFromMemory())
3659           continue;
3660 
3661       // If the instruction is already in PredBB, check if we can sink its
3662       // operands. In that case, VPlan's sinkScalarOperands() succeeded in
3663       // sinking the scalar instruction I, hence it appears in PredBB; but it
3664       // may have failed to sink I's operands (recursively), which we try
3665       // (again) here.
3666       if (I->getParent() == PredBB) {
3667         Worklist.insert(I->op_begin(), I->op_end());
3668         continue;
3669       }
3670 
3671       // It's legal to sink the instruction if all its uses occur in the
3672       // predicated block. Otherwise, there's nothing to do yet, and we may
3673       // need to reanalyze the instruction.
3674       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
3675         InstsToReanalyze.push_back(I);
3676         continue;
3677       }
3678 
3679       // Move the instruction to the beginning of the predicated block, and add
3680       // it's operands to the worklist.
3681       I->moveBefore(&*PredBB->getFirstInsertionPt());
3682       Worklist.insert(I->op_begin(), I->op_end());
3683 
3684       // The sinking may have enabled other instructions to be sunk, so we will
3685       // need to iterate.
3686       Changed = true;
3687     }
3688   } while (Changed);
3689 }
3690 
3691 void InnerLoopVectorizer::fixNonInductionPHIs(VPlan &Plan,
3692                                               VPTransformState &State) {
3693   auto Iter = vp_depth_first_deep(Plan.getEntry());
3694   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
3695     for (VPRecipeBase &P : VPBB->phis()) {
3696       VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P);
3697       if (!VPPhi)
3698         continue;
3699       PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
3700       // Make sure the builder has a valid insert point.
3701       Builder.SetInsertPoint(NewPhi);
3702       for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
3703         VPValue *Inc = VPPhi->getIncomingValue(i);
3704         VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
3705         NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
3706       }
3707     }
3708   }
3709 }
3710 
3711 bool InnerLoopVectorizer::useOrderedReductions(
3712     const RecurrenceDescriptor &RdxDesc) {
3713   return Cost->useOrderedReductions(RdxDesc);
3714 }
3715 
3716 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
3717   // We should not collect Scalars more than once per VF. Right now, this
3718   // function is called from collectUniformsAndScalars(), which already does
3719   // this check. Collecting Scalars for VF=1 does not make any sense.
3720   assert(VF.isVector() && !Scalars.contains(VF) &&
3721          "This function should not be visited twice for the same VF");
3722 
3723   // This avoids any chances of creating a REPLICATE recipe during planning
3724   // since that would result in generation of scalarized code during execution,
3725   // which is not supported for scalable vectors.
3726   if (VF.isScalable()) {
3727     Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end());
3728     return;
3729   }
3730 
3731   SmallSetVector<Instruction *, 8> Worklist;
3732 
3733   // These sets are used to seed the analysis with pointers used by memory
3734   // accesses that will remain scalar.
3735   SmallSetVector<Instruction *, 8> ScalarPtrs;
3736   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
3737   auto *Latch = TheLoop->getLoopLatch();
3738 
3739   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
3740   // The pointer operands of loads and stores will be scalar as long as the
3741   // memory access is not a gather or scatter operation. The value operand of a
3742   // store will remain scalar if the store is scalarized.
3743   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
3744     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
3745     assert(WideningDecision != CM_Unknown &&
3746            "Widening decision should be ready at this moment");
3747     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
3748       if (Ptr == Store->getValueOperand())
3749         return WideningDecision == CM_Scalarize;
3750     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
3751            "Ptr is neither a value or pointer operand");
3752     return WideningDecision != CM_GatherScatter;
3753   };
3754 
3755   // A helper that returns true if the given value is a bitcast or
3756   // getelementptr instruction contained in the loop.
3757   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
3758     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
3759             isa<GetElementPtrInst>(V)) &&
3760            !TheLoop->isLoopInvariant(V);
3761   };
3762 
3763   // A helper that evaluates a memory access's use of a pointer. If the use will
3764   // be a scalar use and the pointer is only used by memory accesses, we place
3765   // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
3766   // PossibleNonScalarPtrs.
3767   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
3768     // We only care about bitcast and getelementptr instructions contained in
3769     // the loop.
3770     if (!isLoopVaryingBitCastOrGEP(Ptr))
3771       return;
3772 
3773     // If the pointer has already been identified as scalar (e.g., if it was
3774     // also identified as uniform), there's nothing to do.
3775     auto *I = cast<Instruction>(Ptr);
3776     if (Worklist.count(I))
3777       return;
3778 
3779     // If the use of the pointer will be a scalar use, and all users of the
3780     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
3781     // place the pointer in PossibleNonScalarPtrs.
3782     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
3783           return isa<LoadInst>(U) || isa<StoreInst>(U);
3784         }))
3785       ScalarPtrs.insert(I);
3786     else
3787       PossibleNonScalarPtrs.insert(I);
3788   };
3789 
3790   // We seed the scalars analysis with three classes of instructions: (1)
3791   // instructions marked uniform-after-vectorization and (2) bitcast,
3792   // getelementptr and (pointer) phi instructions used by memory accesses
3793   // requiring a scalar use.
3794   //
3795   // (1) Add to the worklist all instructions that have been identified as
3796   // uniform-after-vectorization.
3797   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
3798 
3799   // (2) Add to the worklist all bitcast and getelementptr instructions used by
3800   // memory accesses requiring a scalar use. The pointer operands of loads and
3801   // stores will be scalar as long as the memory accesses is not a gather or
3802   // scatter operation. The value operand of a store will remain scalar if the
3803   // store is scalarized.
3804   for (auto *BB : TheLoop->blocks())
3805     for (auto &I : *BB) {
3806       if (auto *Load = dyn_cast<LoadInst>(&I)) {
3807         evaluatePtrUse(Load, Load->getPointerOperand());
3808       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
3809         evaluatePtrUse(Store, Store->getPointerOperand());
3810         evaluatePtrUse(Store, Store->getValueOperand());
3811       }
3812     }
3813   for (auto *I : ScalarPtrs)
3814     if (!PossibleNonScalarPtrs.count(I)) {
3815       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
3816       Worklist.insert(I);
3817     }
3818 
3819   // Insert the forced scalars.
3820   // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector
3821   // induction variable when the PHI user is scalarized.
3822   auto ForcedScalar = ForcedScalars.find(VF);
3823   if (ForcedScalar != ForcedScalars.end())
3824     for (auto *I : ForcedScalar->second) {
3825       LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n");
3826       Worklist.insert(I);
3827     }
3828 
3829   // Expand the worklist by looking through any bitcasts and getelementptr
3830   // instructions we've already identified as scalar. This is similar to the
3831   // expansion step in collectLoopUniforms(); however, here we're only
3832   // expanding to include additional bitcasts and getelementptr instructions.
3833   unsigned Idx = 0;
3834   while (Idx != Worklist.size()) {
3835     Instruction *Dst = Worklist[Idx++];
3836     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
3837       continue;
3838     auto *Src = cast<Instruction>(Dst->getOperand(0));
3839     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
3840           auto *J = cast<Instruction>(U);
3841           return !TheLoop->contains(J) || Worklist.count(J) ||
3842                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
3843                   isScalarUse(J, Src));
3844         })) {
3845       Worklist.insert(Src);
3846       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
3847     }
3848   }
3849 
3850   // An induction variable will remain scalar if all users of the induction
3851   // variable and induction variable update remain scalar.
3852   for (const auto &Induction : Legal->getInductionVars()) {
3853     auto *Ind = Induction.first;
3854     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
3855 
3856     // If tail-folding is applied, the primary induction variable will be used
3857     // to feed a vector compare.
3858     if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
3859       continue;
3860 
3861     // Returns true if \p Indvar is a pointer induction that is used directly by
3862     // load/store instruction \p I.
3863     auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
3864                                               Instruction *I) {
3865       return Induction.second.getKind() ==
3866                  InductionDescriptor::IK_PtrInduction &&
3867              (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
3868              Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar);
3869     };
3870 
3871     // Determine if all users of the induction variable are scalar after
3872     // vectorization.
3873     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
3874       auto *I = cast<Instruction>(U);
3875       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
3876              IsDirectLoadStoreFromPtrIndvar(Ind, I);
3877     });
3878     if (!ScalarInd)
3879       continue;
3880 
3881     // Determine if all users of the induction variable update instruction are
3882     // scalar after vectorization.
3883     auto ScalarIndUpdate =
3884         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
3885           auto *I = cast<Instruction>(U);
3886           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
3887                  IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
3888         });
3889     if (!ScalarIndUpdate)
3890       continue;
3891 
3892     // The induction variable and its update instruction will remain scalar.
3893     Worklist.insert(Ind);
3894     Worklist.insert(IndUpdate);
3895     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
3896     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
3897                       << "\n");
3898   }
3899 
3900   Scalars[VF].insert(Worklist.begin(), Worklist.end());
3901 }
3902 
3903 bool LoopVectorizationCostModel::isScalarWithPredication(
3904     Instruction *I, ElementCount VF) const {
3905   if (!isPredicatedInst(I))
3906     return false;
3907 
3908   // Do we have a non-scalar lowering for this predicated
3909   // instruction? No - it is scalar with predication.
3910   switch(I->getOpcode()) {
3911   default:
3912     return true;
3913   case Instruction::Call:
3914     if (VF.isScalar())
3915       return true;
3916     return CallWideningDecisions.at(std::make_pair(cast<CallInst>(I), VF))
3917                .Kind == CM_Scalarize;
3918   case Instruction::Load:
3919   case Instruction::Store: {
3920     auto *Ptr = getLoadStorePointerOperand(I);
3921     auto *Ty = getLoadStoreType(I);
3922     Type *VTy = Ty;
3923     if (VF.isVector())
3924       VTy = VectorType::get(Ty, VF);
3925     const Align Alignment = getLoadStoreAlignment(I);
3926     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
3927                                 TTI.isLegalMaskedGather(VTy, Alignment))
3928                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
3929                                 TTI.isLegalMaskedScatter(VTy, Alignment));
3930   }
3931   case Instruction::UDiv:
3932   case Instruction::SDiv:
3933   case Instruction::SRem:
3934   case Instruction::URem: {
3935     // We have the option to use the safe-divisor idiom to avoid predication.
3936     // The cost based decision here will always select safe-divisor for
3937     // scalable vectors as scalarization isn't legal.
3938     const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
3939     return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost);
3940   }
3941   }
3942 }
3943 
3944 bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
3945   if (!blockNeedsPredicationForAnyReason(I->getParent()))
3946     return false;
3947 
3948   // Can we prove this instruction is safe to unconditionally execute?
3949   // If not, we must use some form of predication.
3950   switch(I->getOpcode()) {
3951   default:
3952     return false;
3953   case Instruction::Load:
3954   case Instruction::Store: {
3955     if (!Legal->isMaskRequired(I))
3956       return false;
3957     // When we know the load's address is loop invariant and the instruction
3958     // in the original scalar loop was unconditionally executed then we
3959     // don't need to mark it as a predicated instruction. Tail folding may
3960     // introduce additional predication, but we're guaranteed to always have
3961     // at least one active lane.  We call Legal->blockNeedsPredication here
3962     // because it doesn't query tail-folding.  For stores, we need to prove
3963     // both speculation safety (which follows from the same argument as loads),
3964     // but also must prove the value being stored is correct.  The easiest
3965     // form of the later is to require that all values stored are the same.
3966     if (Legal->isInvariant(getLoadStorePointerOperand(I)) &&
3967         (isa<LoadInst>(I) ||
3968          (isa<StoreInst>(I) &&
3969           TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()))) &&
3970         !Legal->blockNeedsPredication(I->getParent()))
3971       return false;
3972     return true;
3973   }
3974   case Instruction::UDiv:
3975   case Instruction::SDiv:
3976   case Instruction::SRem:
3977   case Instruction::URem:
3978     // TODO: We can use the loop-preheader as context point here and get
3979     // context sensitive reasoning
3980     return !isSafeToSpeculativelyExecute(I);
3981   case Instruction::Call:
3982     return Legal->isMaskRequired(I);
3983   }
3984 }
3985 
3986 std::pair<InstructionCost, InstructionCost>
3987 LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
3988                                                     ElementCount VF) const {
3989   assert(I->getOpcode() == Instruction::UDiv ||
3990          I->getOpcode() == Instruction::SDiv ||
3991          I->getOpcode() == Instruction::SRem ||
3992          I->getOpcode() == Instruction::URem);
3993   assert(!isSafeToSpeculativelyExecute(I));
3994 
3995   const TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
3996 
3997   // Scalarization isn't legal for scalable vector types
3998   InstructionCost ScalarizationCost = InstructionCost::getInvalid();
3999   if (!VF.isScalable()) {
4000     // Get the scalarization cost and scale this amount by the probability of
4001     // executing the predicated block. If the instruction is not predicated,
4002     // we fall through to the next case.
4003     ScalarizationCost = 0;
4004 
4005     // These instructions have a non-void type, so account for the phi nodes
4006     // that we will create. This cost is likely to be zero. The phi node
4007     // cost, if any, should be scaled by the block probability because it
4008     // models a copy at the end of each predicated block.
4009     ScalarizationCost += VF.getKnownMinValue() *
4010       TTI.getCFInstrCost(Instruction::PHI, CostKind);
4011 
4012     // The cost of the non-predicated instruction.
4013     ScalarizationCost += VF.getKnownMinValue() *
4014       TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind);
4015 
4016     // The cost of insertelement and extractelement instructions needed for
4017     // scalarization.
4018     ScalarizationCost += getScalarizationOverhead(I, VF, CostKind);
4019 
4020     // Scale the cost by the probability of executing the predicated blocks.
4021     // This assumes the predicated block for each vector lane is equally
4022     // likely.
4023     ScalarizationCost = ScalarizationCost / getReciprocalPredBlockProb();
4024   }
4025   InstructionCost SafeDivisorCost = 0;
4026 
4027   auto *VecTy = ToVectorTy(I->getType(), VF);
4028 
4029   // The cost of the select guard to ensure all lanes are well defined
4030   // after we speculate above any internal control flow.
4031   SafeDivisorCost += TTI.getCmpSelInstrCost(
4032     Instruction::Select, VecTy,
4033     ToVectorTy(Type::getInt1Ty(I->getContext()), VF),
4034     CmpInst::BAD_ICMP_PREDICATE, CostKind);
4035 
4036   // Certain instructions can be cheaper to vectorize if they have a constant
4037   // second vector operand. One example of this are shifts on x86.
4038   Value *Op2 = I->getOperand(1);
4039   auto Op2Info = TTI.getOperandInfo(Op2);
4040   if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
4041       Legal->isInvariant(Op2))
4042     Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
4043 
4044   SmallVector<const Value *, 4> Operands(I->operand_values());
4045   SafeDivisorCost += TTI.getArithmeticInstrCost(
4046     I->getOpcode(), VecTy, CostKind,
4047     {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
4048     Op2Info, Operands, I);
4049   return {ScalarizationCost, SafeDivisorCost};
4050 }
4051 
4052 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
4053     Instruction *I, ElementCount VF) {
4054   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4055   assert(getWideningDecision(I, VF) == CM_Unknown &&
4056          "Decision should not be set yet.");
4057   auto *Group = getInterleavedAccessGroup(I);
4058   assert(Group && "Must have a group.");
4059 
4060   // If the instruction's allocated size doesn't equal it's type size, it
4061   // requires padding and will be scalarized.
4062   auto &DL = I->getModule()->getDataLayout();
4063   auto *ScalarTy = getLoadStoreType(I);
4064   if (hasIrregularType(ScalarTy, DL))
4065     return false;
4066 
4067   // If the group involves a non-integral pointer, we may not be able to
4068   // losslessly cast all values to a common type.
4069   unsigned InterleaveFactor = Group->getFactor();
4070   bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy);
4071   for (unsigned i = 0; i < InterleaveFactor; i++) {
4072     Instruction *Member = Group->getMember(i);
4073     if (!Member)
4074       continue;
4075     auto *MemberTy = getLoadStoreType(Member);
4076     bool MemberNI = DL.isNonIntegralPointerType(MemberTy);
4077     // Don't coerce non-integral pointers to integers or vice versa.
4078     if (MemberNI != ScalarNI) {
4079       // TODO: Consider adding special nullptr value case here
4080       return false;
4081     } else if (MemberNI && ScalarNI &&
4082                ScalarTy->getPointerAddressSpace() !=
4083                MemberTy->getPointerAddressSpace()) {
4084       return false;
4085     }
4086   }
4087 
4088   // Check if masking is required.
4089   // A Group may need masking for one of two reasons: it resides in a block that
4090   // needs predication, or it was decided to use masking to deal with gaps
4091   // (either a gap at the end of a load-access that may result in a speculative
4092   // load, or any gaps in a store-access).
4093   bool PredicatedAccessRequiresMasking =
4094       blockNeedsPredicationForAnyReason(I->getParent()) &&
4095       Legal->isMaskRequired(I);
4096   bool LoadAccessWithGapsRequiresEpilogMasking =
4097       isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
4098       !isScalarEpilogueAllowed();
4099   bool StoreAccessWithGapsRequiresMasking =
4100       isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
4101   if (!PredicatedAccessRequiresMasking &&
4102       !LoadAccessWithGapsRequiresEpilogMasking &&
4103       !StoreAccessWithGapsRequiresMasking)
4104     return true;
4105 
4106   // If masked interleaving is required, we expect that the user/target had
4107   // enabled it, because otherwise it either wouldn't have been created or
4108   // it should have been invalidated by the CostModel.
4109   assert(useMaskedInterleavedAccesses(TTI) &&
4110          "Masked interleave-groups for predicated accesses are not enabled.");
4111 
4112   if (Group->isReverse())
4113     return false;
4114 
4115   auto *Ty = getLoadStoreType(I);
4116   const Align Alignment = getLoadStoreAlignment(I);
4117   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4118                           : TTI.isLegalMaskedStore(Ty, Alignment);
4119 }
4120 
4121 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
4122     Instruction *I, ElementCount VF) {
4123   // Get and ensure we have a valid memory instruction.
4124   assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
4125 
4126   auto *Ptr = getLoadStorePointerOperand(I);
4127   auto *ScalarTy = getLoadStoreType(I);
4128 
4129   // In order to be widened, the pointer should be consecutive, first of all.
4130   if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
4131     return false;
4132 
4133   // If the instruction is a store located in a predicated block, it will be
4134   // scalarized.
4135   if (isScalarWithPredication(I, VF))
4136     return false;
4137 
4138   // If the instruction's allocated size doesn't equal it's type size, it
4139   // requires padding and will be scalarized.
4140   auto &DL = I->getModule()->getDataLayout();
4141   if (hasIrregularType(ScalarTy, DL))
4142     return false;
4143 
4144   return true;
4145 }
4146 
4147 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
4148   // We should not collect Uniforms more than once per VF. Right now,
4149   // this function is called from collectUniformsAndScalars(), which
4150   // already does this check. Collecting Uniforms for VF=1 does not make any
4151   // sense.
4152 
4153   assert(VF.isVector() && !Uniforms.contains(VF) &&
4154          "This function should not be visited twice for the same VF");
4155 
4156   // Visit the list of Uniforms. If we'll not find any uniform value, we'll
4157   // not analyze again.  Uniforms.count(VF) will return 1.
4158   Uniforms[VF].clear();
4159 
4160   // We now know that the loop is vectorizable!
4161   // Collect instructions inside the loop that will remain uniform after
4162   // vectorization.
4163 
4164   // Global values, params and instructions outside of current loop are out of
4165   // scope.
4166   auto isOutOfScope = [&](Value *V) -> bool {
4167     Instruction *I = dyn_cast<Instruction>(V);
4168     return (!I || !TheLoop->contains(I));
4169   };
4170 
4171   // Worklist containing uniform instructions demanding lane 0.
4172   SetVector<Instruction *> Worklist;
4173   BasicBlock *Latch = TheLoop->getLoopLatch();
4174 
4175   // Add uniform instructions demanding lane 0 to the worklist. Instructions
4176   // that are scalar with predication must not be considered uniform after
4177   // vectorization, because that would create an erroneous replicating region
4178   // where only a single instance out of VF should be formed.
4179   // TODO: optimize such seldom cases if found important, see PR40816.
4180   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
4181     if (isOutOfScope(I)) {
4182       LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
4183                         << *I << "\n");
4184       return;
4185     }
4186     if (isScalarWithPredication(I, VF)) {
4187       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
4188                         << *I << "\n");
4189       return;
4190     }
4191     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
4192     Worklist.insert(I);
4193   };
4194 
4195   // Start with the conditional branch. If the branch condition is an
4196   // instruction contained in the loop that is only used by the branch, it is
4197   // uniform.
4198   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4199   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
4200     addToWorklistIfAllowed(Cmp);
4201 
4202   auto PrevVF = VF.divideCoefficientBy(2);
4203   // Return true if all lanes perform the same memory operation, and we can
4204   // thus chose to execute only one.
4205   auto isUniformMemOpUse = [&](Instruction *I) {
4206     // If the value was already known to not be uniform for the previous
4207     // (smaller VF), it cannot be uniform for the larger VF.
4208     if (PrevVF.isVector()) {
4209       auto Iter = Uniforms.find(PrevVF);
4210       if (Iter != Uniforms.end() && !Iter->second.contains(I))
4211         return false;
4212     }
4213     if (!Legal->isUniformMemOp(*I, VF))
4214       return false;
4215     if (isa<LoadInst>(I))
4216       // Loading the same address always produces the same result - at least
4217       // assuming aliasing and ordering which have already been checked.
4218       return true;
4219     // Storing the same value on every iteration.
4220     return TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand());
4221   };
4222 
4223   auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
4224     InstWidening WideningDecision = getWideningDecision(I, VF);
4225     assert(WideningDecision != CM_Unknown &&
4226            "Widening decision should be ready at this moment");
4227 
4228     if (isUniformMemOpUse(I))
4229       return true;
4230 
4231     return (WideningDecision == CM_Widen ||
4232             WideningDecision == CM_Widen_Reverse ||
4233             WideningDecision == CM_Interleave);
4234   };
4235 
4236   // Returns true if Ptr is the pointer operand of a memory access instruction
4237   // I, I is known to not require scalarization, and the pointer is not also
4238   // stored.
4239   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4240     if (isa<StoreInst>(I) && I->getOperand(0) == Ptr)
4241       return false;
4242     return getLoadStorePointerOperand(I) == Ptr &&
4243            (isUniformDecision(I, VF) || Legal->isInvariant(Ptr));
4244   };
4245 
4246   // Holds a list of values which are known to have at least one uniform use.
4247   // Note that there may be other uses which aren't uniform.  A "uniform use"
4248   // here is something which only demands lane 0 of the unrolled iterations;
4249   // it does not imply that all lanes produce the same value (e.g. this is not
4250   // the usual meaning of uniform)
4251   SetVector<Value *> HasUniformUse;
4252 
4253   // Scan the loop for instructions which are either a) known to have only
4254   // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
4255   for (auto *BB : TheLoop->blocks())
4256     for (auto &I : *BB) {
4257       if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
4258         switch (II->getIntrinsicID()) {
4259         case Intrinsic::sideeffect:
4260         case Intrinsic::experimental_noalias_scope_decl:
4261         case Intrinsic::assume:
4262         case Intrinsic::lifetime_start:
4263         case Intrinsic::lifetime_end:
4264           if (TheLoop->hasLoopInvariantOperands(&I))
4265             addToWorklistIfAllowed(&I);
4266           break;
4267         default:
4268           break;
4269         }
4270       }
4271 
4272       // ExtractValue instructions must be uniform, because the operands are
4273       // known to be loop-invariant.
4274       if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
4275         assert(isOutOfScope(EVI->getAggregateOperand()) &&
4276                "Expected aggregate value to be loop invariant");
4277         addToWorklistIfAllowed(EVI);
4278         continue;
4279       }
4280 
4281       // If there's no pointer operand, there's nothing to do.
4282       auto *Ptr = getLoadStorePointerOperand(&I);
4283       if (!Ptr)
4284         continue;
4285 
4286       if (isUniformMemOpUse(&I))
4287         addToWorklistIfAllowed(&I);
4288 
4289       if (isVectorizedMemAccessUse(&I, Ptr))
4290         HasUniformUse.insert(Ptr);
4291     }
4292 
4293   // Add to the worklist any operands which have *only* uniform (e.g. lane 0
4294   // demanding) users.  Since loops are assumed to be in LCSSA form, this
4295   // disallows uses outside the loop as well.
4296   for (auto *V : HasUniformUse) {
4297     if (isOutOfScope(V))
4298       continue;
4299     auto *I = cast<Instruction>(V);
4300     auto UsersAreMemAccesses =
4301       llvm::all_of(I->users(), [&](User *U) -> bool {
4302         return isVectorizedMemAccessUse(cast<Instruction>(U), V);
4303       });
4304     if (UsersAreMemAccesses)
4305       addToWorklistIfAllowed(I);
4306   }
4307 
4308   // Expand Worklist in topological order: whenever a new instruction
4309   // is added , its users should be already inside Worklist.  It ensures
4310   // a uniform instruction will only be used by uniform instructions.
4311   unsigned idx = 0;
4312   while (idx != Worklist.size()) {
4313     Instruction *I = Worklist[idx++];
4314 
4315     for (auto *OV : I->operand_values()) {
4316       // isOutOfScope operands cannot be uniform instructions.
4317       if (isOutOfScope(OV))
4318         continue;
4319       // First order recurrence Phi's should typically be considered
4320       // non-uniform.
4321       auto *OP = dyn_cast<PHINode>(OV);
4322       if (OP && Legal->isFixedOrderRecurrence(OP))
4323         continue;
4324       // If all the users of the operand are uniform, then add the
4325       // operand into the uniform worklist.
4326       auto *OI = cast<Instruction>(OV);
4327       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4328             auto *J = cast<Instruction>(U);
4329             return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
4330           }))
4331         addToWorklistIfAllowed(OI);
4332     }
4333   }
4334 
4335   // For an instruction to be added into Worklist above, all its users inside
4336   // the loop should also be in Worklist. However, this condition cannot be
4337   // true for phi nodes that form a cyclic dependence. We must process phi
4338   // nodes separately. An induction variable will remain uniform if all users
4339   // of the induction variable and induction variable update remain uniform.
4340   // The code below handles both pointer and non-pointer induction variables.
4341   for (const auto &Induction : Legal->getInductionVars()) {
4342     auto *Ind = Induction.first;
4343     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4344 
4345     // Determine if all users of the induction variable are uniform after
4346     // vectorization.
4347     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4348       auto *I = cast<Instruction>(U);
4349       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4350              isVectorizedMemAccessUse(I, Ind);
4351     });
4352     if (!UniformInd)
4353       continue;
4354 
4355     // Determine if all users of the induction variable update instruction are
4356     // uniform after vectorization.
4357     auto UniformIndUpdate =
4358         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4359           auto *I = cast<Instruction>(U);
4360           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4361                  isVectorizedMemAccessUse(I, IndUpdate);
4362         });
4363     if (!UniformIndUpdate)
4364       continue;
4365 
4366     // The induction variable and its update instruction will remain uniform.
4367     addToWorklistIfAllowed(Ind);
4368     addToWorklistIfAllowed(IndUpdate);
4369   }
4370 
4371   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4372 }
4373 
4374 bool LoopVectorizationCostModel::runtimeChecksRequired() {
4375   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
4376 
4377   if (Legal->getRuntimePointerChecking()->Need) {
4378     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
4379         "runtime pointer checks needed. Enable vectorization of this "
4380         "loop with '#pragma clang loop vectorize(enable)' when "
4381         "compiling with -Os/-Oz",
4382         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4383     return true;
4384   }
4385 
4386   if (!PSE.getPredicate().isAlwaysTrue()) {
4387     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
4388         "runtime SCEV checks needed. Enable vectorization of this "
4389         "loop with '#pragma clang loop vectorize(enable)' when "
4390         "compiling with -Os/-Oz",
4391         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4392     return true;
4393   }
4394 
4395   // FIXME: Avoid specializing for stride==1 instead of bailing out.
4396   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4397     reportVectorizationFailure("Runtime stride check for small trip count",
4398         "runtime stride == 1 checks needed. Enable vectorization of "
4399         "this loop without such check by compiling with -Os/-Oz",
4400         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4401     return true;
4402   }
4403 
4404   return false;
4405 }
4406 
4407 ElementCount
4408 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
4409   if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
4410     return ElementCount::getScalable(0);
4411 
4412   if (Hints->isScalableVectorizationDisabled()) {
4413     reportVectorizationInfo("Scalable vectorization is explicitly disabled",
4414                             "ScalableVectorizationDisabled", ORE, TheLoop);
4415     return ElementCount::getScalable(0);
4416   }
4417 
4418   LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
4419 
4420   auto MaxScalableVF = ElementCount::getScalable(
4421       std::numeric_limits<ElementCount::ScalarTy>::max());
4422 
4423   // Test that the loop-vectorizer can legalize all operations for this MaxVF.
4424   // FIXME: While for scalable vectors this is currently sufficient, this should
4425   // be replaced by a more detailed mechanism that filters out specific VFs,
4426   // instead of invalidating vectorization for a whole set of VFs based on the
4427   // MaxVF.
4428 
4429   // Disable scalable vectorization if the loop contains unsupported reductions.
4430   if (!canVectorizeReductions(MaxScalableVF)) {
4431     reportVectorizationInfo(
4432         "Scalable vectorization not supported for the reduction "
4433         "operations found in this loop.",
4434         "ScalableVFUnfeasible", ORE, TheLoop);
4435     return ElementCount::getScalable(0);
4436   }
4437 
4438   // Disable scalable vectorization if the loop contains any instructions
4439   // with element types not supported for scalable vectors.
4440   if (any_of(ElementTypesInLoop, [&](Type *Ty) {
4441         return !Ty->isVoidTy() &&
4442                !this->TTI.isElementTypeLegalForScalableVector(Ty);
4443       })) {
4444     reportVectorizationInfo("Scalable vectorization is not supported "
4445                             "for all element types found in this loop.",
4446                             "ScalableVFUnfeasible", ORE, TheLoop);
4447     return ElementCount::getScalable(0);
4448   }
4449 
4450   if (Legal->isSafeForAnyVectorWidth())
4451     return MaxScalableVF;
4452 
4453   // Limit MaxScalableVF by the maximum safe dependence distance.
4454   if (std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI))
4455     MaxScalableVF = ElementCount::getScalable(MaxSafeElements / *MaxVScale);
4456   else
4457     MaxScalableVF = ElementCount::getScalable(0);
4458 
4459   if (!MaxScalableVF)
4460     reportVectorizationInfo(
4461         "Max legal vector width too small, scalable vectorization "
4462         "unfeasible.",
4463         "ScalableVFUnfeasible", ORE, TheLoop);
4464 
4465   return MaxScalableVF;
4466 }
4467 
4468 FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
4469     unsigned MaxTripCount, ElementCount UserVF, bool FoldTailByMasking) {
4470   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
4471   unsigned SmallestType, WidestType;
4472   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
4473 
4474   // Get the maximum safe dependence distance in bits computed by LAA.
4475   // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
4476   // the memory accesses that is most restrictive (involved in the smallest
4477   // dependence distance).
4478   unsigned MaxSafeElements =
4479       llvm::bit_floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
4480 
4481   auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
4482   auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
4483 
4484   LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
4485                     << ".\n");
4486   LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
4487                     << ".\n");
4488 
4489   // First analyze the UserVF, fall back if the UserVF should be ignored.
4490   if (UserVF) {
4491     auto MaxSafeUserVF =
4492         UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
4493 
4494     if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
4495       // If `VF=vscale x N` is safe, then so is `VF=N`
4496       if (UserVF.isScalable())
4497         return FixedScalableVFPair(
4498             ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
4499       else
4500         return UserVF;
4501     }
4502 
4503     assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
4504 
4505     // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
4506     // is better to ignore the hint and let the compiler choose a suitable VF.
4507     if (!UserVF.isScalable()) {
4508       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4509                         << " is unsafe, clamping to max safe VF="
4510                         << MaxSafeFixedVF << ".\n");
4511       ORE->emit([&]() {
4512         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4513                                           TheLoop->getStartLoc(),
4514                                           TheLoop->getHeader())
4515                << "User-specified vectorization factor "
4516                << ore::NV("UserVectorizationFactor", UserVF)
4517                << " is unsafe, clamping to maximum safe vectorization factor "
4518                << ore::NV("VectorizationFactor", MaxSafeFixedVF);
4519       });
4520       return MaxSafeFixedVF;
4521     }
4522 
4523     if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
4524       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4525                         << " is ignored because scalable vectors are not "
4526                            "available.\n");
4527       ORE->emit([&]() {
4528         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4529                                           TheLoop->getStartLoc(),
4530                                           TheLoop->getHeader())
4531                << "User-specified vectorization factor "
4532                << ore::NV("UserVectorizationFactor", UserVF)
4533                << " is ignored because the target does not support scalable "
4534                   "vectors. The compiler will pick a more suitable value.";
4535       });
4536     } else {
4537       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4538                         << " is unsafe. Ignoring scalable UserVF.\n");
4539       ORE->emit([&]() {
4540         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4541                                           TheLoop->getStartLoc(),
4542                                           TheLoop->getHeader())
4543                << "User-specified vectorization factor "
4544                << ore::NV("UserVectorizationFactor", UserVF)
4545                << " is unsafe. Ignoring the hint to let the compiler pick a "
4546                   "more suitable value.";
4547       });
4548     }
4549   }
4550 
4551   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
4552                     << " / " << WidestType << " bits.\n");
4553 
4554   FixedScalableVFPair Result(ElementCount::getFixed(1),
4555                              ElementCount::getScalable(0));
4556   if (auto MaxVF =
4557           getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
4558                                   MaxSafeFixedVF, FoldTailByMasking))
4559     Result.FixedVF = MaxVF;
4560 
4561   if (auto MaxVF =
4562           getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
4563                                   MaxSafeScalableVF, FoldTailByMasking))
4564     if (MaxVF.isScalable()) {
4565       Result.ScalableVF = MaxVF;
4566       LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
4567                         << "\n");
4568     }
4569 
4570   return Result;
4571 }
4572 
4573 FixedScalableVFPair
4574 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
4575   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
4576     // TODO: It may by useful to do since it's still likely to be dynamically
4577     // uniform if the target can skip.
4578     reportVectorizationFailure(
4579         "Not inserting runtime ptr check for divergent target",
4580         "runtime pointer checks needed. Not enabled for divergent target",
4581         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
4582     return FixedScalableVFPair::getNone();
4583   }
4584 
4585   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4586   unsigned MaxTC = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);
4587   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4588   if (TC == 1) {
4589     reportVectorizationFailure("Single iteration (non) loop",
4590         "loop trip count is one, irrelevant for vectorization",
4591         "SingleIterationLoop", ORE, TheLoop);
4592     return FixedScalableVFPair::getNone();
4593   }
4594 
4595   switch (ScalarEpilogueStatus) {
4596   case CM_ScalarEpilogueAllowed:
4597     return computeFeasibleMaxVF(MaxTC, UserVF, false);
4598   case CM_ScalarEpilogueNotAllowedUsePredicate:
4599     [[fallthrough]];
4600   case CM_ScalarEpilogueNotNeededUsePredicate:
4601     LLVM_DEBUG(
4602         dbgs() << "LV: vector predicate hint/switch found.\n"
4603                << "LV: Not allowing scalar epilogue, creating predicated "
4604                << "vector loop.\n");
4605     break;
4606   case CM_ScalarEpilogueNotAllowedLowTripLoop:
4607     // fallthrough as a special case of OptForSize
4608   case CM_ScalarEpilogueNotAllowedOptSize:
4609     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
4610       LLVM_DEBUG(
4611           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
4612     else
4613       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
4614                         << "count.\n");
4615 
4616     // Bail if runtime checks are required, which are not good when optimising
4617     // for size.
4618     if (runtimeChecksRequired())
4619       return FixedScalableVFPair::getNone();
4620 
4621     break;
4622   }
4623 
4624   // The only loops we can vectorize without a scalar epilogue, are loops with
4625   // a bottom-test and a single exiting block. We'd have to handle the fact
4626   // that not every instruction executes on the last iteration.  This will
4627   // require a lane mask which varies through the vector loop body.  (TODO)
4628   if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
4629     // If there was a tail-folding hint/switch, but we can't fold the tail by
4630     // masking, fallback to a vectorization with a scalar epilogue.
4631     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
4632       LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
4633                            "scalar epilogue instead.\n");
4634       ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4635       return computeFeasibleMaxVF(MaxTC, UserVF, false);
4636     }
4637     return FixedScalableVFPair::getNone();
4638   }
4639 
4640   // Now try the tail folding
4641 
4642   // Invalidate interleave groups that require an epilogue if we can't mask
4643   // the interleave-group.
4644   if (!useMaskedInterleavedAccesses(TTI)) {
4645     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
4646            "No decisions should have been taken at this point");
4647     // Note: There is no need to invalidate any cost modeling decisions here, as
4648     // non where taken so far.
4649     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
4650   }
4651 
4652   FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(MaxTC, UserVF, true);
4653 
4654   // Avoid tail folding if the trip count is known to be a multiple of any VF
4655   // we choose.
4656   std::optional<unsigned> MaxPowerOf2RuntimeVF =
4657       MaxFactors.FixedVF.getFixedValue();
4658   if (MaxFactors.ScalableVF) {
4659     std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
4660     if (MaxVScale && TTI.isVScaleKnownToBeAPowerOfTwo()) {
4661       MaxPowerOf2RuntimeVF = std::max<unsigned>(
4662           *MaxPowerOf2RuntimeVF,
4663           *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue());
4664     } else
4665       MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now.
4666   }
4667 
4668   if (MaxPowerOf2RuntimeVF && *MaxPowerOf2RuntimeVF > 0) {
4669     assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
4670            "MaxFixedVF must be a power of 2");
4671     unsigned MaxVFtimesIC =
4672         UserIC ? *MaxPowerOf2RuntimeVF * UserIC : *MaxPowerOf2RuntimeVF;
4673     ScalarEvolution *SE = PSE.getSE();
4674     const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
4675     const SCEV *ExitCount = SE->getAddExpr(
4676         BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
4677     const SCEV *Rem = SE->getURemExpr(
4678         SE->applyLoopGuards(ExitCount, TheLoop),
4679         SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
4680     if (Rem->isZero()) {
4681       // Accept MaxFixedVF if we do not have a tail.
4682       LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
4683       return MaxFactors;
4684     }
4685   }
4686 
4687   // If we don't know the precise trip count, or if the trip count that we
4688   // found modulo the vectorization factor is not zero, try to fold the tail
4689   // by masking.
4690   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
4691   if (Legal->prepareToFoldTailByMasking()) {
4692     CanFoldTailByMasking = true;
4693     return MaxFactors;
4694   }
4695 
4696   // If there was a tail-folding hint/switch, but we can't fold the tail by
4697   // masking, fallback to a vectorization with a scalar epilogue.
4698   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
4699     LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
4700                          "scalar epilogue instead.\n");
4701     ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4702     return MaxFactors;
4703   }
4704 
4705   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
4706     LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
4707     return FixedScalableVFPair::getNone();
4708   }
4709 
4710   if (TC == 0) {
4711     reportVectorizationFailure(
4712         "Unable to calculate the loop count due to complex control flow",
4713         "unable to calculate the loop count due to complex control flow",
4714         "UnknownLoopCountComplexCFG", ORE, TheLoop);
4715     return FixedScalableVFPair::getNone();
4716   }
4717 
4718   reportVectorizationFailure(
4719       "Cannot optimize for size and vectorize at the same time.",
4720       "cannot optimize for size and vectorize at the same time. "
4721       "Enable vectorization of this loop with '#pragma clang loop "
4722       "vectorize(enable)' when compiling with -Os/-Oz",
4723       "NoTailLoopWithOptForSize", ORE, TheLoop);
4724   return FixedScalableVFPair::getNone();
4725 }
4726 
4727 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
4728     unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType,
4729     ElementCount MaxSafeVF, bool FoldTailByMasking) {
4730   bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
4731   const TypeSize WidestRegister = TTI.getRegisterBitWidth(
4732       ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
4733                            : TargetTransformInfo::RGK_FixedWidthVector);
4734 
4735   // Convenience function to return the minimum of two ElementCounts.
4736   auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
4737     assert((LHS.isScalable() == RHS.isScalable()) &&
4738            "Scalable flags must match");
4739     return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
4740   };
4741 
4742   // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
4743   // Note that both WidestRegister and WidestType may not be a powers of 2.
4744   auto MaxVectorElementCount = ElementCount::get(
4745       llvm::bit_floor(WidestRegister.getKnownMinValue() / WidestType),
4746       ComputeScalableMaxVF);
4747   MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
4748   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
4749                     << (MaxVectorElementCount * WidestType) << " bits.\n");
4750 
4751   if (!MaxVectorElementCount) {
4752     LLVM_DEBUG(dbgs() << "LV: The target has no "
4753                       << (ComputeScalableMaxVF ? "scalable" : "fixed")
4754                       << " vector registers.\n");
4755     return ElementCount::getFixed(1);
4756   }
4757 
4758   unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue();
4759   if (MaxVectorElementCount.isScalable() &&
4760       TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
4761     auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
4762     auto Min = Attr.getVScaleRangeMin();
4763     WidestRegisterMinEC *= Min;
4764   }
4765 
4766   // When a scalar epilogue is required, at least one iteration of the scalar
4767   // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
4768   // max VF that results in a dead vector loop.
4769   if (MaxTripCount > 0 && requiresScalarEpilogue(true))
4770     MaxTripCount -= 1;
4771 
4772   if (MaxTripCount && MaxTripCount <= WidestRegisterMinEC &&
4773       (!FoldTailByMasking || isPowerOf2_32(MaxTripCount))) {
4774     // If upper bound loop trip count (TC) is known at compile time there is no
4775     // point in choosing VF greater than TC (as done in the loop below). Select
4776     // maximum power of two which doesn't exceed TC. If MaxVectorElementCount is
4777     // scalable, we only fall back on a fixed VF when the TC is less than or
4778     // equal to the known number of lanes.
4779     auto ClampedUpperTripCount = llvm::bit_floor(MaxTripCount);
4780     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
4781                          "exceeding the constant trip count: "
4782                       << ClampedUpperTripCount << "\n");
4783     return ElementCount::get(
4784         ClampedUpperTripCount,
4785         FoldTailByMasking ? MaxVectorElementCount.isScalable() : false);
4786   }
4787 
4788   TargetTransformInfo::RegisterKind RegKind =
4789       ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
4790                            : TargetTransformInfo::RGK_FixedWidthVector;
4791   ElementCount MaxVF = MaxVectorElementCount;
4792   if (MaximizeBandwidth ||
4793       (MaximizeBandwidth.getNumOccurrences() == 0 &&
4794        (TTI.shouldMaximizeVectorBandwidth(RegKind) ||
4795         (UseWiderVFIfCallVariantsPresent && Legal->hasVectorCallVariants())))) {
4796     auto MaxVectorElementCountMaxBW = ElementCount::get(
4797         llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType),
4798         ComputeScalableMaxVF);
4799     MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
4800 
4801     // Collect all viable vectorization factors larger than the default MaxVF
4802     // (i.e. MaxVectorElementCount).
4803     SmallVector<ElementCount, 8> VFs;
4804     for (ElementCount VS = MaxVectorElementCount * 2;
4805          ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
4806       VFs.push_back(VS);
4807 
4808     // For each VF calculate its register usage.
4809     auto RUs = calculateRegisterUsage(VFs);
4810 
4811     // Select the largest VF which doesn't require more registers than existing
4812     // ones.
4813     for (int i = RUs.size() - 1; i >= 0; --i) {
4814       bool Selected = true;
4815       for (auto &pair : RUs[i].MaxLocalUsers) {
4816         unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
4817         if (pair.second > TargetNumRegisters)
4818           Selected = false;
4819       }
4820       if (Selected) {
4821         MaxVF = VFs[i];
4822         break;
4823       }
4824     }
4825     if (ElementCount MinVF =
4826             TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
4827       if (ElementCount::isKnownLT(MaxVF, MinVF)) {
4828         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
4829                           << ") with target's minimum: " << MinVF << '\n');
4830         MaxVF = MinVF;
4831       }
4832     }
4833 
4834     // Invalidate any widening decisions we might have made, in case the loop
4835     // requires prediction (decided later), but we have already made some
4836     // load/store widening decisions.
4837     invalidateCostModelingDecisions();
4838   }
4839   return MaxVF;
4840 }
4841 
4842 /// Convenience function that returns the value of vscale_range iff
4843 /// vscale_range.min == vscale_range.max or otherwise returns the value
4844 /// returned by the corresponding TTI method.
4845 static std::optional<unsigned>
4846 getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI) {
4847   const Function *Fn = L->getHeader()->getParent();
4848   if (Fn->hasFnAttribute(Attribute::VScaleRange)) {
4849     auto Attr = Fn->getFnAttribute(Attribute::VScaleRange);
4850     auto Min = Attr.getVScaleRangeMin();
4851     auto Max = Attr.getVScaleRangeMax();
4852     if (Max && Min == Max)
4853       return Max;
4854   }
4855 
4856   return TTI.getVScaleForTuning();
4857 }
4858 
4859 bool LoopVectorizationPlanner::isMoreProfitable(
4860     const VectorizationFactor &A, const VectorizationFactor &B) const {
4861   InstructionCost CostA = A.Cost;
4862   InstructionCost CostB = B.Cost;
4863 
4864   unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(OrigLoop);
4865 
4866   if (!A.Width.isScalable() && !B.Width.isScalable() && MaxTripCount) {
4867     // If the trip count is a known (possibly small) constant, the trip count
4868     // will be rounded up to an integer number of iterations under
4869     // FoldTailByMasking. The total cost in that case will be
4870     // VecCost*ceil(TripCount/VF). When not folding the tail, the total
4871     // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be
4872     // some extra overheads, but for the purpose of comparing the costs of
4873     // different VFs we can use this to compare the total loop-body cost
4874     // expected after vectorization.
4875     auto GetCostForTC = [MaxTripCount, this](unsigned VF,
4876                                              InstructionCost VectorCost,
4877                                              InstructionCost ScalarCost) {
4878       return CM.foldTailByMasking() ? VectorCost * divideCeil(MaxTripCount, VF)
4879                                     : VectorCost * (MaxTripCount / VF) +
4880                                           ScalarCost * (MaxTripCount % VF);
4881     };
4882     auto RTCostA = GetCostForTC(A.Width.getFixedValue(), CostA, A.ScalarCost);
4883     auto RTCostB = GetCostForTC(B.Width.getFixedValue(), CostB, B.ScalarCost);
4884 
4885     return RTCostA < RTCostB;
4886   }
4887 
4888   // Improve estimate for the vector width if it is scalable.
4889   unsigned EstimatedWidthA = A.Width.getKnownMinValue();
4890   unsigned EstimatedWidthB = B.Width.getKnownMinValue();
4891   if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI)) {
4892     if (A.Width.isScalable())
4893       EstimatedWidthA *= *VScale;
4894     if (B.Width.isScalable())
4895       EstimatedWidthB *= *VScale;
4896   }
4897 
4898   // Assume vscale may be larger than 1 (or the value being tuned for),
4899   // so that scalable vectorization is slightly favorable over fixed-width
4900   // vectorization.
4901   if (A.Width.isScalable() && !B.Width.isScalable())
4902     return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA);
4903 
4904   // To avoid the need for FP division:
4905   //      (CostA / A.Width) < (CostB / B.Width)
4906   // <=>  (CostA * B.Width) < (CostB * A.Width)
4907   return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA);
4908 }
4909 
4910 static void emitInvalidCostRemarks(SmallVector<InstructionVFPair> InvalidCosts,
4911                                    OptimizationRemarkEmitter *ORE,
4912                                    Loop *TheLoop) {
4913   if (InvalidCosts.empty())
4914     return;
4915 
4916   // Emit a report of VFs with invalid costs in the loop.
4917 
4918   // Group the remarks per instruction, keeping the instruction order from
4919   // InvalidCosts.
4920   std::map<Instruction *, unsigned> Numbering;
4921   unsigned I = 0;
4922   for (auto &Pair : InvalidCosts)
4923     if (!Numbering.count(Pair.first))
4924       Numbering[Pair.first] = I++;
4925 
4926   // Sort the list, first on instruction(number) then on VF.
4927   sort(InvalidCosts, [&Numbering](InstructionVFPair &A, InstructionVFPair &B) {
4928     if (Numbering[A.first] != Numbering[B.first])
4929       return Numbering[A.first] < Numbering[B.first];
4930     ElementCountComparator ECC;
4931     return ECC(A.second, B.second);
4932   });
4933 
4934   // For a list of ordered instruction-vf pairs:
4935   //   [(load, vf1), (load, vf2), (store, vf1)]
4936   // Group the instructions together to emit separate remarks for:
4937   //   load  (vf1, vf2)
4938   //   store (vf1)
4939   auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts);
4940   auto Subset = ArrayRef<InstructionVFPair>();
4941   do {
4942     if (Subset.empty())
4943       Subset = Tail.take_front(1);
4944 
4945     Instruction *I = Subset.front().first;
4946 
4947     // If the next instruction is different, or if there are no other pairs,
4948     // emit a remark for the collated subset. e.g.
4949     //   [(load, vf1), (load, vf2))]
4950     // to emit:
4951     //  remark: invalid costs for 'load' at VF=(vf, vf2)
4952     if (Subset == Tail || Tail[Subset.size()].first != I) {
4953       std::string OutString;
4954       raw_string_ostream OS(OutString);
4955       assert(!Subset.empty() && "Unexpected empty range");
4956       OS << "Instruction with invalid costs prevented vectorization at VF=(";
4957       for (const auto &Pair : Subset)
4958         OS << (Pair.second == Subset.front().second ? "" : ", ") << Pair.second;
4959       OS << "):";
4960       if (auto *CI = dyn_cast<CallInst>(I))
4961         OS << " call to " << CI->getCalledFunction()->getName();
4962       else
4963         OS << " " << I->getOpcodeName();
4964       OS.flush();
4965       reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I);
4966       Tail = Tail.drop_front(Subset.size());
4967       Subset = {};
4968     } else
4969       // Grow the subset by one element
4970       Subset = Tail.take_front(Subset.size() + 1);
4971   } while (!Tail.empty());
4972 }
4973 
4974 VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor(
4975     const ElementCountSet &VFCandidates) {
4976   InstructionCost ExpectedCost =
4977       CM.expectedCost(ElementCount::getFixed(1)).first;
4978   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
4979   assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
4980   assert(VFCandidates.count(ElementCount::getFixed(1)) &&
4981          "Expected Scalar VF to be a candidate");
4982 
4983   const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost,
4984                                        ExpectedCost);
4985   VectorizationFactor ChosenFactor = ScalarCost;
4986 
4987   bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
4988   if (ForceVectorization && VFCandidates.size() > 1) {
4989     // Ignore scalar width, because the user explicitly wants vectorization.
4990     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
4991     // evaluation.
4992     ChosenFactor.Cost = InstructionCost::getMax();
4993   }
4994 
4995   SmallVector<InstructionVFPair> InvalidCosts;
4996   for (const auto &i : VFCandidates) {
4997     // The cost for scalar VF=1 is already calculated, so ignore it.
4998     if (i.isScalar())
4999       continue;
5000 
5001     LoopVectorizationCostModel::VectorizationCostTy C =
5002         CM.expectedCost(i, &InvalidCosts);
5003     VectorizationFactor Candidate(i, C.first, ScalarCost.ScalarCost);
5004 
5005 #ifndef NDEBUG
5006     unsigned AssumedMinimumVscale =
5007         getVScaleForTuning(OrigLoop, TTI).value_or(1);
5008     unsigned Width =
5009         Candidate.Width.isScalable()
5010             ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
5011             : Candidate.Width.getFixedValue();
5012     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5013                       << " costs: " << (Candidate.Cost / Width));
5014     if (i.isScalable())
5015       LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
5016                         << AssumedMinimumVscale << ")");
5017     LLVM_DEBUG(dbgs() << ".\n");
5018 #endif
5019 
5020     if (!C.second && !ForceVectorization) {
5021       LLVM_DEBUG(
5022           dbgs() << "LV: Not considering vector loop of width " << i
5023                  << " because it will not generate any vector instructions.\n");
5024       continue;
5025     }
5026 
5027     // If profitable add it to ProfitableVF list.
5028     if (isMoreProfitable(Candidate, ScalarCost))
5029       ProfitableVFs.push_back(Candidate);
5030 
5031     if (isMoreProfitable(Candidate, ChosenFactor))
5032       ChosenFactor = Candidate;
5033   }
5034 
5035   emitInvalidCostRemarks(InvalidCosts, ORE, OrigLoop);
5036 
5037   if (!EnableCondStoresVectorization && CM.hasPredStores()) {
5038     reportVectorizationFailure(
5039         "There are conditional stores.",
5040         "store that is conditionally executed prevents vectorization",
5041         "ConditionalStore", ORE, OrigLoop);
5042     ChosenFactor = ScalarCost;
5043   }
5044 
5045   LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
5046                  !isMoreProfitable(ChosenFactor, ScalarCost)) dbgs()
5047              << "LV: Vectorization seems to be not beneficial, "
5048              << "but was forced by a user.\n");
5049   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
5050   return ChosenFactor;
5051 }
5052 
5053 bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
5054     ElementCount VF) const {
5055   // Cross iteration phis such as reductions need special handling and are
5056   // currently unsupported.
5057   if (any_of(OrigLoop->getHeader()->phis(),
5058              [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(&Phi); }))
5059     return false;
5060 
5061   // Phis with uses outside of the loop require special handling and are
5062   // currently unsupported.
5063   for (const auto &Entry : Legal->getInductionVars()) {
5064     // Look for uses of the value of the induction at the last iteration.
5065     Value *PostInc =
5066         Entry.first->getIncomingValueForBlock(OrigLoop->getLoopLatch());
5067     for (User *U : PostInc->users())
5068       if (!OrigLoop->contains(cast<Instruction>(U)))
5069         return false;
5070     // Look for uses of penultimate value of the induction.
5071     for (User *U : Entry.first->users())
5072       if (!OrigLoop->contains(cast<Instruction>(U)))
5073         return false;
5074   }
5075 
5076   // Epilogue vectorization code has not been auditted to ensure it handles
5077   // non-latch exits properly.  It may be fine, but it needs auditted and
5078   // tested.
5079   if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch())
5080     return false;
5081 
5082   return true;
5083 }
5084 
5085 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
5086     const ElementCount VF) const {
5087   // FIXME: We need a much better cost-model to take different parameters such
5088   // as register pressure, code size increase and cost of extra branches into
5089   // account. For now we apply a very crude heuristic and only consider loops
5090   // with vectorization factors larger than a certain value.
5091 
5092   // Allow the target to opt out entirely.
5093   if (!TTI.preferEpilogueVectorization())
5094     return false;
5095 
5096   // We also consider epilogue vectorization unprofitable for targets that don't
5097   // consider interleaving beneficial (eg. MVE).
5098   if (TTI.getMaxInterleaveFactor(VF) <= 1)
5099     return false;
5100 
5101   unsigned Multiplier = 1;
5102   if (VF.isScalable())
5103     Multiplier = getVScaleForTuning(TheLoop, TTI).value_or(1);
5104   if ((Multiplier * VF.getKnownMinValue()) >= EpilogueVectorizationMinVF)
5105     return true;
5106   return false;
5107 }
5108 
5109 VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
5110     const ElementCount MainLoopVF, unsigned IC) {
5111   VectorizationFactor Result = VectorizationFactor::Disabled();
5112   if (!EnableEpilogueVectorization) {
5113     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n");
5114     return Result;
5115   }
5116 
5117   if (!CM.isScalarEpilogueAllowed()) {
5118     LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no "
5119                          "epilogue is allowed.\n");
5120     return Result;
5121   }
5122 
5123   // Not really a cost consideration, but check for unsupported cases here to
5124   // simplify the logic.
5125   if (!isCandidateForEpilogueVectorization(MainLoopVF)) {
5126     LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop "
5127                          "is not a supported candidate.\n");
5128     return Result;
5129   }
5130 
5131   if (EpilogueVectorizationForceVF > 1) {
5132     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n");
5133     ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF);
5134     if (hasPlanWithVF(ForcedEC))
5135       return {ForcedEC, 0, 0};
5136     else {
5137       LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not "
5138                            "viable.\n");
5139       return Result;
5140     }
5141   }
5142 
5143   if (OrigLoop->getHeader()->getParent()->hasOptSize() ||
5144       OrigLoop->getHeader()->getParent()->hasMinSize()) {
5145     LLVM_DEBUG(
5146         dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n");
5147     return Result;
5148   }
5149 
5150   if (!CM.isEpilogueVectorizationProfitable(MainLoopVF)) {
5151     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
5152                          "this loop\n");
5153     return Result;
5154   }
5155 
5156   // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
5157   // the main loop handles 8 lanes per iteration. We could still benefit from
5158   // vectorizing the epilogue loop with VF=4.
5159   ElementCount EstimatedRuntimeVF = MainLoopVF;
5160   if (MainLoopVF.isScalable()) {
5161     EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue());
5162     if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI))
5163       EstimatedRuntimeVF *= *VScale;
5164   }
5165 
5166   ScalarEvolution &SE = *PSE.getSE();
5167   Type *TCType = Legal->getWidestInductionType();
5168   const SCEV *RemainingIterations = nullptr;
5169   for (auto &NextVF : ProfitableVFs) {
5170     // Skip candidate VFs without a corresponding VPlan.
5171     if (!hasPlanWithVF(NextVF.Width))
5172       continue;
5173 
5174     // Skip candidate VFs with widths >= the estimate runtime VF (scalable
5175     // vectors) or the VF of the main loop (fixed vectors).
5176     if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
5177          ElementCount::isKnownGE(NextVF.Width, EstimatedRuntimeVF)) ||
5178         ElementCount::isKnownGE(NextVF.Width, MainLoopVF))
5179       continue;
5180 
5181     // If NextVF is greater than the number of remaining iterations, the
5182     // epilogue loop would be dead. Skip such factors.
5183     if (!MainLoopVF.isScalable() && !NextVF.Width.isScalable()) {
5184       // TODO: extend to support scalable VFs.
5185       if (!RemainingIterations) {
5186         const SCEV *TC = createTripCountSCEV(TCType, PSE, OrigLoop);
5187         RemainingIterations = SE.getURemExpr(
5188             TC, SE.getConstant(TCType, MainLoopVF.getKnownMinValue() * IC));
5189       }
5190       if (SE.isKnownPredicate(
5191               CmpInst::ICMP_UGT,
5192               SE.getConstant(TCType, NextVF.Width.getKnownMinValue()),
5193               RemainingIterations))
5194         continue;
5195     }
5196 
5197     if (Result.Width.isScalar() || isMoreProfitable(NextVF, Result))
5198       Result = NextVF;
5199   }
5200 
5201   if (Result != VectorizationFactor::Disabled())
5202     LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
5203                       << Result.Width << "\n");
5204   return Result;
5205 }
5206 
5207 std::pair<unsigned, unsigned>
5208 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5209   unsigned MinWidth = -1U;
5210   unsigned MaxWidth = 8;
5211   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5212   // For in-loop reductions, no element types are added to ElementTypesInLoop
5213   // if there are no loads/stores in the loop. In this case, check through the
5214   // reduction variables to determine the maximum width.
5215   if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
5216     // Reset MaxWidth so that we can find the smallest type used by recurrences
5217     // in the loop.
5218     MaxWidth = -1U;
5219     for (const auto &PhiDescriptorPair : Legal->getReductionVars()) {
5220       const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
5221       // When finding the min width used by the recurrence we need to account
5222       // for casts on the input operands of the recurrence.
5223       MaxWidth = std::min<unsigned>(
5224           MaxWidth, std::min<unsigned>(
5225                         RdxDesc.getMinWidthCastToRecurrenceTypeInBits(),
5226                         RdxDesc.getRecurrenceType()->getScalarSizeInBits()));
5227     }
5228   } else {
5229     for (Type *T : ElementTypesInLoop) {
5230       MinWidth = std::min<unsigned>(
5231           MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
5232       MaxWidth = std::max<unsigned>(
5233           MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
5234     }
5235   }
5236   return {MinWidth, MaxWidth};
5237 }
5238 
5239 void LoopVectorizationCostModel::collectElementTypesForWidening() {
5240   ElementTypesInLoop.clear();
5241   // For each block.
5242   for (BasicBlock *BB : TheLoop->blocks()) {
5243     // For each instruction in the loop.
5244     for (Instruction &I : BB->instructionsWithoutDebug()) {
5245       Type *T = I.getType();
5246 
5247       // Skip ignored values.
5248       if (ValuesToIgnore.count(&I))
5249         continue;
5250 
5251       // Only examine Loads, Stores and PHINodes.
5252       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5253         continue;
5254 
5255       // Examine PHI nodes that are reduction variables. Update the type to
5256       // account for the recurrence type.
5257       if (auto *PN = dyn_cast<PHINode>(&I)) {
5258         if (!Legal->isReductionVariable(PN))
5259           continue;
5260         const RecurrenceDescriptor &RdxDesc =
5261             Legal->getReductionVars().find(PN)->second;
5262         if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
5263             TTI.preferInLoopReduction(RdxDesc.getOpcode(),
5264                                       RdxDesc.getRecurrenceType(),
5265                                       TargetTransformInfo::ReductionFlags()))
5266           continue;
5267         T = RdxDesc.getRecurrenceType();
5268       }
5269 
5270       // Examine the stored values.
5271       if (auto *ST = dyn_cast<StoreInst>(&I))
5272         T = ST->getValueOperand()->getType();
5273 
5274       assert(T->isSized() &&
5275              "Expected the load/store/recurrence type to be sized");
5276 
5277       ElementTypesInLoop.insert(T);
5278     }
5279   }
5280 }
5281 
5282 unsigned
5283 LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
5284                                                   InstructionCost LoopCost) {
5285   // -- The interleave heuristics --
5286   // We interleave the loop in order to expose ILP and reduce the loop overhead.
5287   // There are many micro-architectural considerations that we can't predict
5288   // at this level. For example, frontend pressure (on decode or fetch) due to
5289   // code size, or the number and capabilities of the execution ports.
5290   //
5291   // We use the following heuristics to select the interleave count:
5292   // 1. If the code has reductions, then we interleave to break the cross
5293   // iteration dependency.
5294   // 2. If the loop is really small, then we interleave to reduce the loop
5295   // overhead.
5296   // 3. We don't interleave if we think that we will spill registers to memory
5297   // due to the increased register pressure.
5298 
5299   if (!isScalarEpilogueAllowed())
5300     return 1;
5301 
5302   // We used the distance for the interleave count.
5303   if (!Legal->isSafeForAnyVectorWidth())
5304     return 1;
5305 
5306   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5307   const bool HasReductions = !Legal->getReductionVars().empty();
5308   // Do not interleave loops with a relatively small known or estimated trip
5309   // count. But we will interleave when InterleaveSmallLoopScalarReduction is
5310   // enabled, and the code has scalar reductions(HasReductions && VF = 1),
5311   // because with the above conditions interleaving can expose ILP and break
5312   // cross iteration dependences for reductions.
5313   if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
5314       !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
5315     return 1;
5316 
5317   // If we did not calculate the cost for VF (because the user selected the VF)
5318   // then we calculate the cost of VF here.
5319   if (LoopCost == 0) {
5320     LoopCost = expectedCost(VF).first;
5321     assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost");
5322 
5323     // Loop body is free and there is no need for interleaving.
5324     if (LoopCost == 0)
5325       return 1;
5326   }
5327 
5328   RegisterUsage R = calculateRegisterUsage({VF})[0];
5329   // We divide by these constants so assume that we have at least one
5330   // instruction that uses at least one register.
5331   for (auto& pair : R.MaxLocalUsers) {
5332     pair.second = std::max(pair.second, 1U);
5333   }
5334 
5335   // We calculate the interleave count using the following formula.
5336   // Subtract the number of loop invariants from the number of available
5337   // registers. These registers are used by all of the interleaved instances.
5338   // Next, divide the remaining registers by the number of registers that is
5339   // required by the loop, in order to estimate how many parallel instances
5340   // fit without causing spills. All of this is rounded down if necessary to be
5341   // a power of two. We want power of two interleave count to simplify any
5342   // addressing operations or alignment considerations.
5343   // We also want power of two interleave counts to ensure that the induction
5344   // variable of the vector loop wraps to zero, when tail is folded by masking;
5345   // this currently happens when OptForSize, in which case IC is set to 1 above.
5346   unsigned IC = UINT_MAX;
5347 
5348   for (auto& pair : R.MaxLocalUsers) {
5349     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5350     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5351                       << " registers of "
5352                       << TTI.getRegisterClassName(pair.first) << " register class\n");
5353     if (VF.isScalar()) {
5354       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5355         TargetNumRegisters = ForceTargetNumScalarRegs;
5356     } else {
5357       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5358         TargetNumRegisters = ForceTargetNumVectorRegs;
5359     }
5360     unsigned MaxLocalUsers = pair.second;
5361     unsigned LoopInvariantRegs = 0;
5362     if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
5363       LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
5364 
5365     unsigned TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs) /
5366                                      MaxLocalUsers);
5367     // Don't count the induction variable as interleaved.
5368     if (EnableIndVarRegisterHeur) {
5369       TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5370                               std::max(1U, (MaxLocalUsers - 1)));
5371     }
5372 
5373     IC = std::min(IC, TmpIC);
5374   }
5375 
5376   // Clamp the interleave ranges to reasonable counts.
5377   unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
5378 
5379   // Check if the user has overridden the max.
5380   if (VF.isScalar()) {
5381     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5382       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5383   } else {
5384     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5385       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5386   }
5387 
5388   unsigned EstimatedVF = VF.getKnownMinValue();
5389   if (VF.isScalable()) {
5390     if (std::optional<unsigned> VScale = getVScaleForTuning(TheLoop, TTI))
5391       EstimatedVF *= *VScale;
5392   }
5393   assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
5394 
5395   unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5396   if (KnownTC) {
5397     // If trip count is known we select between two prospective ICs, where
5398     // 1) the aggressive IC is capped by the trip count divided by VF
5399     // 2) the conservative IC is capped by the trip count divided by (VF * 2)
5400     // The final IC is selected in a way that the epilogue loop trip count is
5401     // minimized while maximizing the IC itself, so that we either run the
5402     // vector loop at least once if it generates a small epilogue loop, or else
5403     // we run the vector loop at least twice.
5404 
5405     unsigned InterleaveCountUB = bit_floor(
5406         std::max(1u, std::min(KnownTC / EstimatedVF, MaxInterleaveCount)));
5407     unsigned InterleaveCountLB = bit_floor(std::max(
5408         1u, std::min(KnownTC / (EstimatedVF * 2), MaxInterleaveCount)));
5409     MaxInterleaveCount = InterleaveCountLB;
5410 
5411     if (InterleaveCountUB != InterleaveCountLB) {
5412       unsigned TailTripCountUB = (KnownTC % (EstimatedVF * InterleaveCountUB));
5413       unsigned TailTripCountLB = (KnownTC % (EstimatedVF * InterleaveCountLB));
5414       // If both produce same scalar tail, maximize the IC to do the same work
5415       // in fewer vector loop iterations
5416       if (TailTripCountUB == TailTripCountLB)
5417         MaxInterleaveCount = InterleaveCountUB;
5418     }
5419   } else if (BestKnownTC) {
5420     // If trip count is an estimated compile time constant, limit the
5421     // IC to be capped by the trip count divided by VF * 2, such that the vector
5422     // loop runs at least twice to make interleaving seem profitable when there
5423     // is an epilogue loop present. Since exact Trip count is not known we
5424     // choose to be conservative in our IC estimate.
5425     MaxInterleaveCount = bit_floor(std::max(
5426         1u, std::min(*BestKnownTC / (EstimatedVF * 2), MaxInterleaveCount)));
5427   }
5428 
5429   assert(MaxInterleaveCount > 0 &&
5430          "Maximum interleave count must be greater than 0");
5431 
5432   // Clamp the calculated IC to be between the 1 and the max interleave count
5433   // that the target and trip count allows.
5434   if (IC > MaxInterleaveCount)
5435     IC = MaxInterleaveCount;
5436   else
5437     // Make sure IC is greater than 0.
5438     IC = std::max(1u, IC);
5439 
5440   assert(IC > 0 && "Interleave count must be greater than 0.");
5441 
5442   // Interleave if we vectorized this loop and there is a reduction that could
5443   // benefit from interleaving.
5444   if (VF.isVector() && HasReductions) {
5445     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5446     return IC;
5447   }
5448 
5449   // For any scalar loop that either requires runtime checks or predication we
5450   // are better off leaving this to the unroller. Note that if we've already
5451   // vectorized the loop we will have done the runtime check and so interleaving
5452   // won't require further checks.
5453   bool ScalarInterleavingRequiresPredication =
5454       (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) {
5455          return Legal->blockNeedsPredication(BB);
5456        }));
5457   bool ScalarInterleavingRequiresRuntimePointerCheck =
5458       (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
5459 
5460   // We want to interleave small loops in order to reduce the loop overhead and
5461   // potentially expose ILP opportunities.
5462   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
5463                     << "LV: IC is " << IC << '\n'
5464                     << "LV: VF is " << VF << '\n');
5465   const bool AggressivelyInterleaveReductions =
5466       TTI.enableAggressiveInterleaving(HasReductions);
5467   if (!ScalarInterleavingRequiresRuntimePointerCheck &&
5468       !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
5469     // We assume that the cost overhead is 1 and we use the cost model
5470     // to estimate the cost of the loop and interleave until the cost of the
5471     // loop overhead is about 5% of the cost of the loop.
5472     unsigned SmallIC = std::min(IC, (unsigned)llvm::bit_floor<uint64_t>(
5473                                         SmallLoopCost / *LoopCost.getValue()));
5474 
5475     // Interleave until store/load ports (estimated by max interleave count) are
5476     // saturated.
5477     unsigned NumStores = Legal->getNumStores();
5478     unsigned NumLoads = Legal->getNumLoads();
5479     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5480     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5481 
5482     // There is little point in interleaving for reductions containing selects
5483     // and compares when VF=1 since it may just create more overhead than it's
5484     // worth for loops with small trip counts. This is because we still have to
5485     // do the final reduction after the loop.
5486     bool HasSelectCmpReductions =
5487         HasReductions &&
5488         any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5489           const RecurrenceDescriptor &RdxDesc = Reduction.second;
5490           return RecurrenceDescriptor::isAnyOfRecurrenceKind(
5491               RdxDesc.getRecurrenceKind());
5492         });
5493     if (HasSelectCmpReductions) {
5494       LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
5495       return 1;
5496     }
5497 
5498     // If we have a scalar reduction (vector reductions are already dealt with
5499     // by this point), we can increase the critical path length if the loop
5500     // we're interleaving is inside another loop. For tree-wise reductions
5501     // set the limit to 2, and for ordered reductions it's best to disable
5502     // interleaving entirely.
5503     if (HasReductions && TheLoop->getLoopDepth() > 1) {
5504       bool HasOrderedReductions =
5505           any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5506             const RecurrenceDescriptor &RdxDesc = Reduction.second;
5507             return RdxDesc.isOrdered();
5508           });
5509       if (HasOrderedReductions) {
5510         LLVM_DEBUG(
5511             dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
5512         return 1;
5513       }
5514 
5515       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5516       SmallIC = std::min(SmallIC, F);
5517       StoresIC = std::min(StoresIC, F);
5518       LoadsIC = std::min(LoadsIC, F);
5519     }
5520 
5521     if (EnableLoadStoreRuntimeInterleave &&
5522         std::max(StoresIC, LoadsIC) > SmallIC) {
5523       LLVM_DEBUG(
5524           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5525       return std::max(StoresIC, LoadsIC);
5526     }
5527 
5528     // If there are scalar reductions and TTI has enabled aggressive
5529     // interleaving for reductions, we will interleave to expose ILP.
5530     if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
5531         AggressivelyInterleaveReductions) {
5532       LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5533       // Interleave no less than SmallIC but not as aggressive as the normal IC
5534       // to satisfy the rare situation when resources are too limited.
5535       return std::max(IC / 2, SmallIC);
5536     } else {
5537       LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5538       return SmallIC;
5539     }
5540   }
5541 
5542   // Interleave if this is a large loop (small loops are already dealt with by
5543   // this point) that could benefit from interleaving.
5544   if (AggressivelyInterleaveReductions) {
5545     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5546     return IC;
5547   }
5548 
5549   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5550   return 1;
5551 }
5552 
5553 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5554 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
5555   // This function calculates the register usage by measuring the highest number
5556   // of values that are alive at a single location. Obviously, this is a very
5557   // rough estimation. We scan the loop in a topological order in order and
5558   // assign a number to each instruction. We use RPO to ensure that defs are
5559   // met before their users. We assume that each instruction that has in-loop
5560   // users starts an interval. We record every time that an in-loop value is
5561   // used, so we have a list of the first and last occurrences of each
5562   // instruction. Next, we transpose this data structure into a multi map that
5563   // holds the list of intervals that *end* at a specific location. This multi
5564   // map allows us to perform a linear search. We scan the instructions linearly
5565   // and record each time that a new interval starts, by placing it in a set.
5566   // If we find this value in the multi-map then we remove it from the set.
5567   // The max register usage is the maximum size of the set.
5568   // We also search for instructions that are defined outside the loop, but are
5569   // used inside the loop. We need this number separately from the max-interval
5570   // usage number because when we unroll, loop-invariant values do not take
5571   // more register.
5572   LoopBlocksDFS DFS(TheLoop);
5573   DFS.perform(LI);
5574 
5575   RegisterUsage RU;
5576 
5577   // Each 'key' in the map opens a new interval. The values
5578   // of the map are the index of the 'last seen' usage of the
5579   // instruction that is the key.
5580   using IntervalMap = DenseMap<Instruction *, unsigned>;
5581 
5582   // Maps instruction to its index.
5583   SmallVector<Instruction *, 64> IdxToInstr;
5584   // Marks the end of each interval.
5585   IntervalMap EndPoint;
5586   // Saves the list of instruction indices that are used in the loop.
5587   SmallPtrSet<Instruction *, 8> Ends;
5588   // Saves the list of values that are used in the loop but are defined outside
5589   // the loop (not including non-instruction values such as arguments and
5590   // constants).
5591   SmallSetVector<Instruction *, 8> LoopInvariants;
5592 
5593   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5594     for (Instruction &I : BB->instructionsWithoutDebug()) {
5595       IdxToInstr.push_back(&I);
5596 
5597       // Save the end location of each USE.
5598       for (Value *U : I.operands()) {
5599         auto *Instr = dyn_cast<Instruction>(U);
5600 
5601         // Ignore non-instruction values such as arguments, constants, etc.
5602         // FIXME: Might need some motivation why these values are ignored. If
5603         // for example an argument is used inside the loop it will increase the
5604         // register pressure (so shouldn't we add it to LoopInvariants).
5605         if (!Instr)
5606           continue;
5607 
5608         // If this instruction is outside the loop then record it and continue.
5609         if (!TheLoop->contains(Instr)) {
5610           LoopInvariants.insert(Instr);
5611           continue;
5612         }
5613 
5614         // Overwrite previous end points.
5615         EndPoint[Instr] = IdxToInstr.size();
5616         Ends.insert(Instr);
5617       }
5618     }
5619   }
5620 
5621   // Saves the list of intervals that end with the index in 'key'.
5622   using InstrList = SmallVector<Instruction *, 2>;
5623   DenseMap<unsigned, InstrList> TransposeEnds;
5624 
5625   // Transpose the EndPoints to a list of values that end at each index.
5626   for (auto &Interval : EndPoint)
5627     TransposeEnds[Interval.second].push_back(Interval.first);
5628 
5629   SmallPtrSet<Instruction *, 8> OpenIntervals;
5630   SmallVector<RegisterUsage, 8> RUs(VFs.size());
5631   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
5632 
5633   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5634 
5635   const auto &TTICapture = TTI;
5636   auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
5637     if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
5638       return 0;
5639     return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
5640   };
5641 
5642   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
5643     Instruction *I = IdxToInstr[i];
5644 
5645     // Remove all of the instructions that end at this location.
5646     InstrList &List = TransposeEnds[i];
5647     for (Instruction *ToRemove : List)
5648       OpenIntervals.erase(ToRemove);
5649 
5650     // Ignore instructions that are never used within the loop.
5651     if (!Ends.count(I))
5652       continue;
5653 
5654     // Skip ignored values.
5655     if (ValuesToIgnore.count(I))
5656       continue;
5657 
5658     collectInLoopReductions();
5659 
5660     // For each VF find the maximum usage of registers.
5661     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
5662       // Count the number of registers used, per register class, given all open
5663       // intervals.
5664       // Note that elements in this SmallMapVector will be default constructed
5665       // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if
5666       // there is no previous entry for ClassID.
5667       SmallMapVector<unsigned, unsigned, 4> RegUsage;
5668 
5669       if (VFs[j].isScalar()) {
5670         for (auto *Inst : OpenIntervals) {
5671           unsigned ClassID =
5672               TTI.getRegisterClassForType(false, Inst->getType());
5673           // FIXME: The target might use more than one register for the type
5674           // even in the scalar case.
5675           RegUsage[ClassID] += 1;
5676         }
5677       } else {
5678         collectUniformsAndScalars(VFs[j]);
5679         for (auto *Inst : OpenIntervals) {
5680           // Skip ignored values for VF > 1.
5681           if (VecValuesToIgnore.count(Inst))
5682             continue;
5683           if (isScalarAfterVectorization(Inst, VFs[j])) {
5684             unsigned ClassID =
5685                 TTI.getRegisterClassForType(false, Inst->getType());
5686             // FIXME: The target might use more than one register for the type
5687             // even in the scalar case.
5688             RegUsage[ClassID] += 1;
5689           } else {
5690             unsigned ClassID =
5691                 TTI.getRegisterClassForType(true, Inst->getType());
5692             RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
5693           }
5694         }
5695       }
5696 
5697       for (auto& pair : RegUsage) {
5698         auto &Entry = MaxUsages[j][pair.first];
5699         Entry = std::max(Entry, pair.second);
5700       }
5701     }
5702 
5703     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
5704                       << OpenIntervals.size() << '\n');
5705 
5706     // Add the current instruction to the list of open intervals.
5707     OpenIntervals.insert(I);
5708   }
5709 
5710   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
5711     // Note that elements in this SmallMapVector will be default constructed
5712     // as 0. So we can use "Invariant[ClassID] += n" in the code below even if
5713     // there is no previous entry for ClassID.
5714     SmallMapVector<unsigned, unsigned, 4> Invariant;
5715 
5716     for (auto *Inst : LoopInvariants) {
5717       // FIXME: The target might use more than one register for the type
5718       // even in the scalar case.
5719       bool IsScalar = all_of(Inst->users(), [&](User *U) {
5720         auto *I = cast<Instruction>(U);
5721         return TheLoop != LI->getLoopFor(I->getParent()) ||
5722                isScalarAfterVectorization(I, VFs[i]);
5723       });
5724 
5725       ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[i];
5726       unsigned ClassID =
5727           TTI.getRegisterClassForType(VF.isVector(), Inst->getType());
5728       Invariant[ClassID] += GetRegUsage(Inst->getType(), VF);
5729     }
5730 
5731     LLVM_DEBUG({
5732       dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
5733       dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
5734              << " item\n";
5735       for (const auto &pair : MaxUsages[i]) {
5736         dbgs() << "LV(REG): RegisterClass: "
5737                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5738                << " registers\n";
5739       }
5740       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
5741              << " item\n";
5742       for (const auto &pair : Invariant) {
5743         dbgs() << "LV(REG): RegisterClass: "
5744                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5745                << " registers\n";
5746       }
5747     });
5748 
5749     RU.LoopInvariantRegs = Invariant;
5750     RU.MaxLocalUsers = MaxUsages[i];
5751     RUs[i] = RU;
5752   }
5753 
5754   return RUs;
5755 }
5756 
5757 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
5758                                                            ElementCount VF) {
5759   // TODO: Cost model for emulated masked load/store is completely
5760   // broken. This hack guides the cost model to use an artificially
5761   // high enough value to practically disable vectorization with such
5762   // operations, except where previously deployed legality hack allowed
5763   // using very low cost values. This is to avoid regressions coming simply
5764   // from moving "masked load/store" check from legality to cost model.
5765   // Masked Load/Gather emulation was previously never allowed.
5766   // Limited number of Masked Store/Scatter emulation was allowed.
5767   assert((isPredicatedInst(I)) &&
5768          "Expecting a scalar emulated instruction");
5769   return isa<LoadInst>(I) ||
5770          (isa<StoreInst>(I) &&
5771           NumPredStores > NumberOfStoresToPredicate);
5772 }
5773 
5774 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
5775   // If we aren't vectorizing the loop, or if we've already collected the
5776   // instructions to scalarize, there's nothing to do. Collection may already
5777   // have occurred if we have a user-selected VF and are now computing the
5778   // expected cost for interleaving.
5779   if (VF.isScalar() || VF.isZero() || InstsToScalarize.contains(VF))
5780     return;
5781 
5782   // Initialize a mapping for VF in InstsToScalalarize. If we find that it's
5783   // not profitable to scalarize any instructions, the presence of VF in the
5784   // map will indicate that we've analyzed it already.
5785   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5786 
5787   PredicatedBBsAfterVectorization[VF].clear();
5788 
5789   // Find all the instructions that are scalar with predication in the loop and
5790   // determine if it would be better to not if-convert the blocks they are in.
5791   // If so, we also record the instructions to scalarize.
5792   for (BasicBlock *BB : TheLoop->blocks()) {
5793     if (!blockNeedsPredicationForAnyReason(BB))
5794       continue;
5795     for (Instruction &I : *BB)
5796       if (isScalarWithPredication(&I, VF)) {
5797         ScalarCostsTy ScalarCosts;
5798         // Do not apply discount if scalable, because that would lead to
5799         // invalid scalarization costs.
5800         // Do not apply discount logic if hacked cost is needed
5801         // for emulated masked memrefs.
5802         if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) &&
5803             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
5804           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5805         // Remember that BB will remain after vectorization.
5806         PredicatedBBsAfterVectorization[VF].insert(BB);
5807       }
5808   }
5809 }
5810 
5811 InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
5812     Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
5813   assert(!isUniformAfterVectorization(PredInst, VF) &&
5814          "Instruction marked uniform-after-vectorization will be predicated");
5815 
5816   // Initialize the discount to zero, meaning that the scalar version and the
5817   // vector version cost the same.
5818   InstructionCost Discount = 0;
5819 
5820   // Holds instructions to analyze. The instructions we visit are mapped in
5821   // ScalarCosts. Those instructions are the ones that would be scalarized if
5822   // we find that the scalar version costs less.
5823   SmallVector<Instruction *, 8> Worklist;
5824 
5825   // Returns true if the given instruction can be scalarized.
5826   auto canBeScalarized = [&](Instruction *I) -> bool {
5827     // We only attempt to scalarize instructions forming a single-use chain
5828     // from the original predicated block that would otherwise be vectorized.
5829     // Although not strictly necessary, we give up on instructions we know will
5830     // already be scalar to avoid traversing chains that are unlikely to be
5831     // beneficial.
5832     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5833         isScalarAfterVectorization(I, VF))
5834       return false;
5835 
5836     // If the instruction is scalar with predication, it will be analyzed
5837     // separately. We ignore it within the context of PredInst.
5838     if (isScalarWithPredication(I, VF))
5839       return false;
5840 
5841     // If any of the instruction's operands are uniform after vectorization,
5842     // the instruction cannot be scalarized. This prevents, for example, a
5843     // masked load from being scalarized.
5844     //
5845     // We assume we will only emit a value for lane zero of an instruction
5846     // marked uniform after vectorization, rather than VF identical values.
5847     // Thus, if we scalarize an instruction that uses a uniform, we would
5848     // create uses of values corresponding to the lanes we aren't emitting code
5849     // for. This behavior can be changed by allowing getScalarValue to clone
5850     // the lane zero values for uniforms rather than asserting.
5851     for (Use &U : I->operands())
5852       if (auto *J = dyn_cast<Instruction>(U.get()))
5853         if (isUniformAfterVectorization(J, VF))
5854           return false;
5855 
5856     // Otherwise, we can scalarize the instruction.
5857     return true;
5858   };
5859 
5860   // Compute the expected cost discount from scalarizing the entire expression
5861   // feeding the predicated instruction. We currently only consider expressions
5862   // that are single-use instruction chains.
5863   Worklist.push_back(PredInst);
5864   while (!Worklist.empty()) {
5865     Instruction *I = Worklist.pop_back_val();
5866 
5867     // If we've already analyzed the instruction, there's nothing to do.
5868     if (ScalarCosts.contains(I))
5869       continue;
5870 
5871     // Compute the cost of the vector instruction. Note that this cost already
5872     // includes the scalarization overhead of the predicated instruction.
5873     InstructionCost VectorCost = getInstructionCost(I, VF).first;
5874 
5875     // Compute the cost of the scalarized instruction. This cost is the cost of
5876     // the instruction as if it wasn't if-converted and instead remained in the
5877     // predicated block. We will scale this cost by block probability after
5878     // computing the scalarization overhead.
5879     InstructionCost ScalarCost =
5880         VF.getFixedValue() *
5881         getInstructionCost(I, ElementCount::getFixed(1)).first;
5882 
5883     // Compute the scalarization overhead of needed insertelement instructions
5884     // and phi nodes.
5885     TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
5886     if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
5887       ScalarCost += TTI.getScalarizationOverhead(
5888           cast<VectorType>(ToVectorTy(I->getType(), VF)),
5889           APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ true,
5890           /*Extract*/ false, CostKind);
5891       ScalarCost +=
5892           VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind);
5893     }
5894 
5895     // Compute the scalarization overhead of needed extractelement
5896     // instructions. For each of the instruction's operands, if the operand can
5897     // be scalarized, add it to the worklist; otherwise, account for the
5898     // overhead.
5899     for (Use &U : I->operands())
5900       if (auto *J = dyn_cast<Instruction>(U.get())) {
5901         assert(VectorType::isValidElementType(J->getType()) &&
5902                "Instruction has non-scalar type");
5903         if (canBeScalarized(J))
5904           Worklist.push_back(J);
5905         else if (needsExtract(J, VF)) {
5906           ScalarCost += TTI.getScalarizationOverhead(
5907               cast<VectorType>(ToVectorTy(J->getType(), VF)),
5908               APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false,
5909               /*Extract*/ true, CostKind);
5910         }
5911       }
5912 
5913     // Scale the total scalar cost by block probability.
5914     ScalarCost /= getReciprocalPredBlockProb();
5915 
5916     // Compute the discount. A non-negative discount means the vector version
5917     // of the instruction costs more, and scalarizing would be beneficial.
5918     Discount += VectorCost - ScalarCost;
5919     ScalarCosts[I] = ScalarCost;
5920   }
5921 
5922   return Discount;
5923 }
5924 
5925 LoopVectorizationCostModel::VectorizationCostTy
5926 LoopVectorizationCostModel::expectedCost(
5927     ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) {
5928   VectorizationCostTy Cost;
5929 
5930   // For each block.
5931   for (BasicBlock *BB : TheLoop->blocks()) {
5932     VectorizationCostTy BlockCost;
5933 
5934     // For each instruction in the old loop.
5935     for (Instruction &I : BB->instructionsWithoutDebug()) {
5936       // Skip ignored values.
5937       if (ValuesToIgnore.count(&I) ||
5938           (VF.isVector() && VecValuesToIgnore.count(&I)))
5939         continue;
5940 
5941       VectorizationCostTy C = getInstructionCost(&I, VF);
5942 
5943       // Check if we should override the cost.
5944       if (C.first.isValid() &&
5945           ForceTargetInstructionCost.getNumOccurrences() > 0)
5946         C.first = InstructionCost(ForceTargetInstructionCost);
5947 
5948       // Keep a list of instructions with invalid costs.
5949       if (Invalid && !C.first.isValid())
5950         Invalid->emplace_back(&I, VF);
5951 
5952       BlockCost.first += C.first;
5953       BlockCost.second |= C.second;
5954       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
5955                         << " for VF " << VF << " For instruction: " << I
5956                         << '\n');
5957     }
5958 
5959     // If we are vectorizing a predicated block, it will have been
5960     // if-converted. This means that the block's instructions (aside from
5961     // stores and instructions that may divide by zero) will now be
5962     // unconditionally executed. For the scalar case, we may not always execute
5963     // the predicated block, if it is an if-else block. Thus, scale the block's
5964     // cost by the probability of executing it. blockNeedsPredication from
5965     // Legal is used so as to not include all blocks in tail folded loops.
5966     if (VF.isScalar() && Legal->blockNeedsPredication(BB))
5967       BlockCost.first /= getReciprocalPredBlockProb();
5968 
5969     Cost.first += BlockCost.first;
5970     Cost.second |= BlockCost.second;
5971   }
5972 
5973   return Cost;
5974 }
5975 
5976 /// Gets Address Access SCEV after verifying that the access pattern
5977 /// is loop invariant except the induction variable dependence.
5978 ///
5979 /// This SCEV can be sent to the Target in order to estimate the address
5980 /// calculation cost.
5981 static const SCEV *getAddressAccessSCEV(
5982               Value *Ptr,
5983               LoopVectorizationLegality *Legal,
5984               PredicatedScalarEvolution &PSE,
5985               const Loop *TheLoop) {
5986 
5987   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
5988   if (!Gep)
5989     return nullptr;
5990 
5991   // We are looking for a gep with all loop invariant indices except for one
5992   // which should be an induction variable.
5993   auto SE = PSE.getSE();
5994   unsigned NumOperands = Gep->getNumOperands();
5995   for (unsigned i = 1; i < NumOperands; ++i) {
5996     Value *Opd = Gep->getOperand(i);
5997     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
5998         !Legal->isInductionVariable(Opd))
5999       return nullptr;
6000   }
6001 
6002   // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV.
6003   return PSE.getSCEV(Ptr);
6004 }
6005 
6006 InstructionCost
6007 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
6008                                                         ElementCount VF) {
6009   assert(VF.isVector() &&
6010          "Scalarization cost of instruction implies vectorization.");
6011   if (VF.isScalable())
6012     return InstructionCost::getInvalid();
6013 
6014   Type *ValTy = getLoadStoreType(I);
6015   auto SE = PSE.getSE();
6016 
6017   unsigned AS = getLoadStoreAddressSpace(I);
6018   Value *Ptr = getLoadStorePointerOperand(I);
6019   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
6020   // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
6021   //       that it is being called from this specific place.
6022 
6023   // Figure out whether the access is strided and get the stride value
6024   // if it's known in compile time
6025   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
6026 
6027   // Get the cost of the scalar memory instruction and address computation.
6028   InstructionCost Cost =
6029       VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
6030 
6031   // Don't pass *I here, since it is scalar but will actually be part of a
6032   // vectorized loop where the user of it is a vectorized instruction.
6033   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6034   const Align Alignment = getLoadStoreAlignment(I);
6035   Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(),
6036                                                       ValTy->getScalarType(),
6037                                                       Alignment, AS, CostKind);
6038 
6039   // Get the overhead of the extractelement and insertelement instructions
6040   // we might create due to scalarization.
6041   Cost += getScalarizationOverhead(I, VF, CostKind);
6042 
6043   // If we have a predicated load/store, it will need extra i1 extracts and
6044   // conditional branches, but may not be executed for each vector lane. Scale
6045   // the cost by the probability of executing the predicated block.
6046   if (isPredicatedInst(I)) {
6047     Cost /= getReciprocalPredBlockProb();
6048 
6049     // Add the cost of an i1 extract and a branch
6050     auto *Vec_i1Ty =
6051         VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
6052     Cost += TTI.getScalarizationOverhead(
6053         Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
6054         /*Insert=*/false, /*Extract=*/true, CostKind);
6055     Cost += TTI.getCFInstrCost(Instruction::Br, CostKind);
6056 
6057     if (useEmulatedMaskMemRefHack(I, VF))
6058       // Artificially setting to a high enough value to practically disable
6059       // vectorization with such operations.
6060       Cost = 3000000;
6061   }
6062 
6063   return Cost;
6064 }
6065 
6066 InstructionCost
6067 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
6068                                                     ElementCount VF) {
6069   Type *ValTy = getLoadStoreType(I);
6070   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6071   Value *Ptr = getLoadStorePointerOperand(I);
6072   unsigned AS = getLoadStoreAddressSpace(I);
6073   int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
6074   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6075 
6076   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6077          "Stride should be 1 or -1 for consecutive memory access");
6078   const Align Alignment = getLoadStoreAlignment(I);
6079   InstructionCost Cost = 0;
6080   if (Legal->isMaskRequired(I)) {
6081     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6082                                       CostKind);
6083   } else {
6084     TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
6085     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6086                                 CostKind, OpInfo, I);
6087   }
6088 
6089   bool Reverse = ConsecutiveStride < 0;
6090   if (Reverse)
6091     Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy,
6092                                std::nullopt, CostKind, 0);
6093   return Cost;
6094 }
6095 
6096 InstructionCost
6097 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
6098                                                 ElementCount VF) {
6099   assert(Legal->isUniformMemOp(*I, VF));
6100 
6101   Type *ValTy = getLoadStoreType(I);
6102   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6103   const Align Alignment = getLoadStoreAlignment(I);
6104   unsigned AS = getLoadStoreAddressSpace(I);
6105   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6106   if (isa<LoadInst>(I)) {
6107     return TTI.getAddressComputationCost(ValTy) +
6108            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
6109                                CostKind) +
6110            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
6111   }
6112   StoreInst *SI = cast<StoreInst>(I);
6113 
6114   bool isLoopInvariantStoreValue = Legal->isInvariant(SI->getValueOperand());
6115   return TTI.getAddressComputationCost(ValTy) +
6116          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
6117                              CostKind) +
6118          (isLoopInvariantStoreValue
6119               ? 0
6120               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
6121                                        CostKind, VF.getKnownMinValue() - 1));
6122 }
6123 
6124 InstructionCost
6125 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
6126                                                  ElementCount VF) {
6127   Type *ValTy = getLoadStoreType(I);
6128   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6129   const Align Alignment = getLoadStoreAlignment(I);
6130   const Value *Ptr = getLoadStorePointerOperand(I);
6131 
6132   return TTI.getAddressComputationCost(VectorTy) +
6133          TTI.getGatherScatterOpCost(
6134              I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
6135              TargetTransformInfo::TCK_RecipThroughput, I);
6136 }
6137 
6138 InstructionCost
6139 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
6140                                                    ElementCount VF) {
6141   Type *ValTy = getLoadStoreType(I);
6142   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6143   unsigned AS = getLoadStoreAddressSpace(I);
6144   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6145 
6146   auto Group = getInterleavedAccessGroup(I);
6147   assert(Group && "Fail to get an interleaved access group.");
6148 
6149   unsigned InterleaveFactor = Group->getFactor();
6150   auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
6151 
6152   // Holds the indices of existing members in the interleaved group.
6153   SmallVector<unsigned, 4> Indices;
6154   for (unsigned IF = 0; IF < InterleaveFactor; IF++)
6155     if (Group->getMember(IF))
6156       Indices.push_back(IF);
6157 
6158   // Calculate the cost of the whole interleaved group.
6159   bool UseMaskForGaps =
6160       (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
6161       (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
6162   InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
6163       I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
6164       AS, CostKind, Legal->isMaskRequired(I), UseMaskForGaps);
6165 
6166   if (Group->isReverse()) {
6167     // TODO: Add support for reversed masked interleaved access.
6168     assert(!Legal->isMaskRequired(I) &&
6169            "Reverse masked interleaved access not supported.");
6170     Cost += Group->getNumMembers() *
6171             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy,
6172                                std::nullopt, CostKind, 0);
6173   }
6174   return Cost;
6175 }
6176 
6177 std::optional<InstructionCost>
6178 LoopVectorizationCostModel::getReductionPatternCost(
6179     Instruction *I, ElementCount VF, Type *Ty,
6180     TTI::TargetCostKind CostKind) const {
6181   using namespace llvm::PatternMatch;
6182   // Early exit for no inloop reductions
6183   if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Ty))
6184     return std::nullopt;
6185   auto *VectorTy = cast<VectorType>(Ty);
6186 
6187   // We are looking for a pattern of, and finding the minimal acceptable cost:
6188   //  reduce(mul(ext(A), ext(B))) or
6189   //  reduce(mul(A, B)) or
6190   //  reduce(ext(A)) or
6191   //  reduce(A).
6192   // The basic idea is that we walk down the tree to do that, finding the root
6193   // reduction instruction in InLoopReductionImmediateChains. From there we find
6194   // the pattern of mul/ext and test the cost of the entire pattern vs the cost
6195   // of the components. If the reduction cost is lower then we return it for the
6196   // reduction instruction and 0 for the other instructions in the pattern. If
6197   // it is not we return an invalid cost specifying the orignal cost method
6198   // should be used.
6199   Instruction *RetI = I;
6200   if (match(RetI, m_ZExtOrSExt(m_Value()))) {
6201     if (!RetI->hasOneUser())
6202       return std::nullopt;
6203     RetI = RetI->user_back();
6204   }
6205 
6206   if (match(RetI, m_OneUse(m_Mul(m_Value(), m_Value()))) &&
6207       RetI->user_back()->getOpcode() == Instruction::Add) {
6208     RetI = RetI->user_back();
6209   }
6210 
6211   // Test if the found instruction is a reduction, and if not return an invalid
6212   // cost specifying the parent to use the original cost modelling.
6213   if (!InLoopReductionImmediateChains.count(RetI))
6214     return std::nullopt;
6215 
6216   // Find the reduction this chain is a part of and calculate the basic cost of
6217   // the reduction on its own.
6218   Instruction *LastChain = InLoopReductionImmediateChains.at(RetI);
6219   Instruction *ReductionPhi = LastChain;
6220   while (!isa<PHINode>(ReductionPhi))
6221     ReductionPhi = InLoopReductionImmediateChains.at(ReductionPhi);
6222 
6223   const RecurrenceDescriptor &RdxDesc =
6224       Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second;
6225 
6226   InstructionCost BaseCost = TTI.getArithmeticReductionCost(
6227       RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
6228 
6229   // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
6230   // normal fmul instruction to the cost of the fadd reduction.
6231   if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd)
6232     BaseCost +=
6233         TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);
6234 
6235   // If we're using ordered reductions then we can just return the base cost
6236   // here, since getArithmeticReductionCost calculates the full ordered
6237   // reduction cost when FP reassociation is not allowed.
6238   if (useOrderedReductions(RdxDesc))
6239     return BaseCost;
6240 
6241   // Get the operand that was not the reduction chain and match it to one of the
6242   // patterns, returning the better cost if it is found.
6243   Instruction *RedOp = RetI->getOperand(1) == LastChain
6244                            ? dyn_cast<Instruction>(RetI->getOperand(0))
6245                            : dyn_cast<Instruction>(RetI->getOperand(1));
6246 
6247   VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
6248 
6249   Instruction *Op0, *Op1;
6250   if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
6251       match(RedOp,
6252             m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
6253       match(Op0, m_ZExtOrSExt(m_Value())) &&
6254       Op0->getOpcode() == Op1->getOpcode() &&
6255       Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
6256       !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) &&
6257       (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
6258 
6259     // Matched reduce.add(ext(mul(ext(A), ext(B)))
6260     // Note that the extend opcodes need to all match, or if A==B they will have
6261     // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
6262     // which is equally fine.
6263     bool IsUnsigned = isa<ZExtInst>(Op0);
6264     auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
6265     auto *MulType = VectorType::get(Op0->getType(), VectorTy);
6266 
6267     InstructionCost ExtCost =
6268         TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
6269                              TTI::CastContextHint::None, CostKind, Op0);
6270     InstructionCost MulCost =
6271         TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
6272     InstructionCost Ext2Cost =
6273         TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType,
6274                              TTI::CastContextHint::None, CostKind, RedOp);
6275 
6276     InstructionCost RedCost = TTI.getMulAccReductionCost(
6277         IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
6278 
6279     if (RedCost.isValid() &&
6280         RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
6281       return I == RetI ? RedCost : 0;
6282   } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
6283              !TheLoop->isLoopInvariant(RedOp)) {
6284     // Matched reduce(ext(A))
6285     bool IsUnsigned = isa<ZExtInst>(RedOp);
6286     auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
6287     InstructionCost RedCost = TTI.getExtendedReductionCost(
6288         RdxDesc.getOpcode(), IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
6289         RdxDesc.getFastMathFlags(), CostKind);
6290 
6291     InstructionCost ExtCost =
6292         TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
6293                              TTI::CastContextHint::None, CostKind, RedOp);
6294     if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
6295       return I == RetI ? RedCost : 0;
6296   } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
6297              match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
6298     if (match(Op0, m_ZExtOrSExt(m_Value())) &&
6299         Op0->getOpcode() == Op1->getOpcode() &&
6300         !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) {
6301       bool IsUnsigned = isa<ZExtInst>(Op0);
6302       Type *Op0Ty = Op0->getOperand(0)->getType();
6303       Type *Op1Ty = Op1->getOperand(0)->getType();
6304       Type *LargestOpTy =
6305           Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
6306                                                                     : Op0Ty;
6307       auto *ExtType = VectorType::get(LargestOpTy, VectorTy);
6308 
6309       // Matched reduce.add(mul(ext(A), ext(B))), where the two ext may be of
6310       // different sizes. We take the largest type as the ext to reduce, and add
6311       // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))).
6312       InstructionCost ExtCost0 = TTI.getCastInstrCost(
6313           Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy),
6314           TTI::CastContextHint::None, CostKind, Op0);
6315       InstructionCost ExtCost1 = TTI.getCastInstrCost(
6316           Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy),
6317           TTI::CastContextHint::None, CostKind, Op1);
6318       InstructionCost MulCost =
6319           TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6320 
6321       InstructionCost RedCost = TTI.getMulAccReductionCost(
6322           IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
6323       InstructionCost ExtraExtCost = 0;
6324       if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
6325         Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
6326         ExtraExtCost = TTI.getCastInstrCost(
6327             ExtraExtOp->getOpcode(), ExtType,
6328             VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy),
6329             TTI::CastContextHint::None, CostKind, ExtraExtOp);
6330       }
6331 
6332       if (RedCost.isValid() &&
6333           (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
6334         return I == RetI ? RedCost : 0;
6335     } else if (!match(I, m_ZExtOrSExt(m_Value()))) {
6336       // Matched reduce.add(mul())
6337       InstructionCost MulCost =
6338           TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6339 
6340       InstructionCost RedCost = TTI.getMulAccReductionCost(
6341           true, RdxDesc.getRecurrenceType(), VectorTy, CostKind);
6342 
6343       if (RedCost.isValid() && RedCost < MulCost + BaseCost)
6344         return I == RetI ? RedCost : 0;
6345     }
6346   }
6347 
6348   return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt;
6349 }
6350 
6351 InstructionCost
6352 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
6353                                                      ElementCount VF) {
6354   // Calculate scalar cost only. Vectorization cost should be ready at this
6355   // moment.
6356   if (VF.isScalar()) {
6357     Type *ValTy = getLoadStoreType(I);
6358     const Align Alignment = getLoadStoreAlignment(I);
6359     unsigned AS = getLoadStoreAddressSpace(I);
6360 
6361     TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
6362     return TTI.getAddressComputationCost(ValTy) +
6363            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
6364                                TTI::TCK_RecipThroughput, OpInfo, I);
6365   }
6366   return getWideningCost(I, VF);
6367 }
6368 
6369 LoopVectorizationCostModel::VectorizationCostTy
6370 LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6371                                                ElementCount VF) {
6372   // If we know that this instruction will remain uniform, check the cost of
6373   // the scalar version.
6374   if (isUniformAfterVectorization(I, VF))
6375     VF = ElementCount::getFixed(1);
6376 
6377   if (VF.isVector() && isProfitableToScalarize(I, VF))
6378     return VectorizationCostTy(InstsToScalarize[VF][I], false);
6379 
6380   // Forced scalars do not have any scalarization overhead.
6381   auto ForcedScalar = ForcedScalars.find(VF);
6382   if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
6383     auto InstSet = ForcedScalar->second;
6384     if (InstSet.count(I))
6385       return VectorizationCostTy(
6386           (getInstructionCost(I, ElementCount::getFixed(1)).first *
6387            VF.getKnownMinValue()),
6388           false);
6389   }
6390 
6391   Type *VectorTy;
6392   InstructionCost C = getInstructionCost(I, VF, VectorTy);
6393 
6394   bool TypeNotScalarized = false;
6395   if (VF.isVector() && VectorTy->isVectorTy()) {
6396     if (unsigned NumParts = TTI.getNumberOfParts(VectorTy)) {
6397       if (VF.isScalable())
6398         // <vscale x 1 x iN> is assumed to be profitable over iN because
6399         // scalable registers are a distinct register class from scalar ones.
6400         // If we ever find a target which wants to lower scalable vectors
6401         // back to scalars, we'll need to update this code to explicitly
6402         // ask TTI about the register class uses for each part.
6403         TypeNotScalarized = NumParts <= VF.getKnownMinValue();
6404       else
6405         TypeNotScalarized = NumParts < VF.getKnownMinValue();
6406     } else
6407       C = InstructionCost::getInvalid();
6408   }
6409   return VectorizationCostTy(C, TypeNotScalarized);
6410 }
6411 
6412 InstructionCost LoopVectorizationCostModel::getScalarizationOverhead(
6413     Instruction *I, ElementCount VF, TTI::TargetCostKind CostKind) const {
6414 
6415   // There is no mechanism yet to create a scalable scalarization loop,
6416   // so this is currently Invalid.
6417   if (VF.isScalable())
6418     return InstructionCost::getInvalid();
6419 
6420   if (VF.isScalar())
6421     return 0;
6422 
6423   InstructionCost Cost = 0;
6424   Type *RetTy = ToVectorTy(I->getType(), VF);
6425   if (!RetTy->isVoidTy() &&
6426       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
6427     Cost += TTI.getScalarizationOverhead(
6428         cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()),
6429         /*Insert*/ true,
6430         /*Extract*/ false, CostKind);
6431 
6432   // Some targets keep addresses scalar.
6433   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
6434     return Cost;
6435 
6436   // Some targets support efficient element stores.
6437   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
6438     return Cost;
6439 
6440   // Collect operands to consider.
6441   CallInst *CI = dyn_cast<CallInst>(I);
6442   Instruction::op_range Ops = CI ? CI->args() : I->operands();
6443 
6444   // Skip operands that do not require extraction/scalarization and do not incur
6445   // any overhead.
6446   SmallVector<Type *> Tys;
6447   for (auto *V : filterExtractingOperands(Ops, VF))
6448     Tys.push_back(MaybeVectorizeType(V->getType(), VF));
6449   return Cost + TTI.getOperandsScalarizationOverhead(
6450                     filterExtractingOperands(Ops, VF), Tys, CostKind);
6451 }
6452 
6453 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
6454   if (VF.isScalar())
6455     return;
6456   NumPredStores = 0;
6457   for (BasicBlock *BB : TheLoop->blocks()) {
6458     // For each instruction in the old loop.
6459     for (Instruction &I : *BB) {
6460       Value *Ptr =  getLoadStorePointerOperand(&I);
6461       if (!Ptr)
6462         continue;
6463 
6464       // TODO: We should generate better code and update the cost model for
6465       // predicated uniform stores. Today they are treated as any other
6466       // predicated store (see added test cases in
6467       // invariant-store-vectorization.ll).
6468       if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF))
6469         NumPredStores++;
6470 
6471       if (Legal->isUniformMemOp(I, VF)) {
6472         auto isLegalToScalarize = [&]() {
6473           if (!VF.isScalable())
6474             // Scalarization of fixed length vectors "just works".
6475             return true;
6476 
6477           // We have dedicated lowering for unpredicated uniform loads and
6478           // stores.  Note that even with tail folding we know that at least
6479           // one lane is active (i.e. generalized predication is not possible
6480           // here), and the logic below depends on this fact.
6481           if (!foldTailByMasking())
6482             return true;
6483 
6484           // For scalable vectors, a uniform memop load is always
6485           // uniform-by-parts  and we know how to scalarize that.
6486           if (isa<LoadInst>(I))
6487             return true;
6488 
6489           // A uniform store isn't neccessarily uniform-by-part
6490           // and we can't assume scalarization.
6491           auto &SI = cast<StoreInst>(I);
6492           return TheLoop->isLoopInvariant(SI.getValueOperand());
6493         };
6494 
6495         const InstructionCost GatherScatterCost =
6496           isLegalGatherOrScatter(&I, VF) ?
6497           getGatherScatterCost(&I, VF) : InstructionCost::getInvalid();
6498 
6499         // Load: Scalar load + broadcast
6500         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6501         // FIXME: This cost is a significant under-estimate for tail folded
6502         // memory ops.
6503         const InstructionCost ScalarizationCost = isLegalToScalarize() ?
6504           getUniformMemOpCost(&I, VF) : InstructionCost::getInvalid();
6505 
6506         // Choose better solution for the current VF,  Note that Invalid
6507         // costs compare as maximumal large.  If both are invalid, we get
6508         // scalable invalid which signals a failure and a vectorization abort.
6509         if (GatherScatterCost < ScalarizationCost)
6510           setWideningDecision(&I, VF, CM_GatherScatter, GatherScatterCost);
6511         else
6512           setWideningDecision(&I, VF, CM_Scalarize, ScalarizationCost);
6513         continue;
6514       }
6515 
6516       // We assume that widening is the best solution when possible.
6517       if (memoryInstructionCanBeWidened(&I, VF)) {
6518         InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
6519         int ConsecutiveStride = Legal->isConsecutivePtr(
6520             getLoadStoreType(&I), getLoadStorePointerOperand(&I));
6521         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6522                "Expected consecutive stride.");
6523         InstWidening Decision =
6524             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6525         setWideningDecision(&I, VF, Decision, Cost);
6526         continue;
6527       }
6528 
6529       // Choose between Interleaving, Gather/Scatter or Scalarization.
6530       InstructionCost InterleaveCost = InstructionCost::getInvalid();
6531       unsigned NumAccesses = 1;
6532       if (isAccessInterleaved(&I)) {
6533         auto Group = getInterleavedAccessGroup(&I);
6534         assert(Group && "Fail to get an interleaved access group.");
6535 
6536         // Make one decision for the whole group.
6537         if (getWideningDecision(&I, VF) != CM_Unknown)
6538           continue;
6539 
6540         NumAccesses = Group->getNumMembers();
6541         if (interleavedAccessCanBeWidened(&I, VF))
6542           InterleaveCost = getInterleaveGroupCost(&I, VF);
6543       }
6544 
6545       InstructionCost GatherScatterCost =
6546           isLegalGatherOrScatter(&I, VF)
6547               ? getGatherScatterCost(&I, VF) * NumAccesses
6548               : InstructionCost::getInvalid();
6549 
6550       InstructionCost ScalarizationCost =
6551           getMemInstScalarizationCost(&I, VF) * NumAccesses;
6552 
6553       // Choose better solution for the current VF,
6554       // write down this decision and use it during vectorization.
6555       InstructionCost Cost;
6556       InstWidening Decision;
6557       if (InterleaveCost <= GatherScatterCost &&
6558           InterleaveCost < ScalarizationCost) {
6559         Decision = CM_Interleave;
6560         Cost = InterleaveCost;
6561       } else if (GatherScatterCost < ScalarizationCost) {
6562         Decision = CM_GatherScatter;
6563         Cost = GatherScatterCost;
6564       } else {
6565         Decision = CM_Scalarize;
6566         Cost = ScalarizationCost;
6567       }
6568       // If the instructions belongs to an interleave group, the whole group
6569       // receives the same decision. The whole group receives the cost, but
6570       // the cost will actually be assigned to one instruction.
6571       if (auto Group = getInterleavedAccessGroup(&I))
6572         setWideningDecision(Group, VF, Decision, Cost);
6573       else
6574         setWideningDecision(&I, VF, Decision, Cost);
6575     }
6576   }
6577 
6578   // Make sure that any load of address and any other address computation
6579   // remains scalar unless there is gather/scatter support. This avoids
6580   // inevitable extracts into address registers, and also has the benefit of
6581   // activating LSR more, since that pass can't optimize vectorized
6582   // addresses.
6583   if (TTI.prefersVectorizedAddressing())
6584     return;
6585 
6586   // Start with all scalar pointer uses.
6587   SmallPtrSet<Instruction *, 8> AddrDefs;
6588   for (BasicBlock *BB : TheLoop->blocks())
6589     for (Instruction &I : *BB) {
6590       Instruction *PtrDef =
6591         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6592       if (PtrDef && TheLoop->contains(PtrDef) &&
6593           getWideningDecision(&I, VF) != CM_GatherScatter)
6594         AddrDefs.insert(PtrDef);
6595     }
6596 
6597   // Add all instructions used to generate the addresses.
6598   SmallVector<Instruction *, 4> Worklist;
6599   append_range(Worklist, AddrDefs);
6600   while (!Worklist.empty()) {
6601     Instruction *I = Worklist.pop_back_val();
6602     for (auto &Op : I->operands())
6603       if (auto *InstOp = dyn_cast<Instruction>(Op))
6604         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6605             AddrDefs.insert(InstOp).second)
6606           Worklist.push_back(InstOp);
6607   }
6608 
6609   for (auto *I : AddrDefs) {
6610     if (isa<LoadInst>(I)) {
6611       // Setting the desired widening decision should ideally be handled in
6612       // by cost functions, but since this involves the task of finding out
6613       // if the loaded register is involved in an address computation, it is
6614       // instead changed here when we know this is the case.
6615       InstWidening Decision = getWideningDecision(I, VF);
6616       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6617         // Scalarize a widened load of address.
6618         setWideningDecision(
6619             I, VF, CM_Scalarize,
6620             (VF.getKnownMinValue() *
6621              getMemoryInstructionCost(I, ElementCount::getFixed(1))));
6622       else if (auto Group = getInterleavedAccessGroup(I)) {
6623         // Scalarize an interleave group of address loads.
6624         for (unsigned I = 0; I < Group->getFactor(); ++I) {
6625           if (Instruction *Member = Group->getMember(I))
6626             setWideningDecision(
6627                 Member, VF, CM_Scalarize,
6628                 (VF.getKnownMinValue() *
6629                  getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
6630         }
6631       }
6632     } else
6633       // Make sure I gets scalarized and a cost estimate without
6634       // scalarization overhead.
6635       ForcedScalars[VF].insert(I);
6636   }
6637 }
6638 
6639 void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
6640   assert(!VF.isScalar() &&
6641          "Trying to set a vectorization decision for a scalar VF");
6642 
6643   for (BasicBlock *BB : TheLoop->blocks()) {
6644     // For each instruction in the old loop.
6645     for (Instruction &I : *BB) {
6646       CallInst *CI = dyn_cast<CallInst>(&I);
6647 
6648       if (!CI)
6649         continue;
6650 
6651       InstructionCost ScalarCost = InstructionCost::getInvalid();
6652       InstructionCost VectorCost = InstructionCost::getInvalid();
6653       InstructionCost IntrinsicCost = InstructionCost::getInvalid();
6654       TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6655 
6656       Function *ScalarFunc = CI->getCalledFunction();
6657       Type *ScalarRetTy = CI->getType();
6658       SmallVector<Type *, 4> Tys, ScalarTys;
6659       bool MaskRequired = Legal->isMaskRequired(CI);
6660       for (auto &ArgOp : CI->args())
6661         ScalarTys.push_back(ArgOp->getType());
6662 
6663       // Compute corresponding vector type for return value and arguments.
6664       Type *RetTy = ToVectorTy(ScalarRetTy, VF);
6665       for (Type *ScalarTy : ScalarTys)
6666         Tys.push_back(ToVectorTy(ScalarTy, VF));
6667 
6668       // An in-loop reduction using an fmuladd intrinsic is a special case;
6669       // we don't want the normal cost for that intrinsic.
6670       if (RecurrenceDescriptor::isFMulAddIntrinsic(CI))
6671         if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind)) {
6672           setCallWideningDecision(CI, VF, CM_IntrinsicCall, nullptr,
6673                                   getVectorIntrinsicIDForCall(CI, TLI),
6674                                   std::nullopt, *RedCost);
6675           continue;
6676         }
6677 
6678       // Estimate cost of scalarized vector call. The source operands are
6679       // assumed to be vectors, so we need to extract individual elements from
6680       // there, execute VF scalar calls, and then gather the result into the
6681       // vector return value.
6682       InstructionCost ScalarCallCost =
6683           TTI.getCallInstrCost(ScalarFunc, ScalarRetTy, ScalarTys, CostKind);
6684 
6685       // Compute costs of unpacking argument values for the scalar calls and
6686       // packing the return values to a vector.
6687       InstructionCost ScalarizationCost =
6688           getScalarizationOverhead(CI, VF, CostKind);
6689 
6690       ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
6691 
6692       // Find the cost of vectorizing the call, if we can find a suitable
6693       // vector variant of the function.
6694       bool UsesMask = false;
6695       VFInfo FuncInfo;
6696       Function *VecFunc = nullptr;
6697       // Search through any available variants for one we can use at this VF.
6698       for (VFInfo &Info : VFDatabase::getMappings(*CI)) {
6699         // Must match requested VF.
6700         if (Info.Shape.VF != VF)
6701           continue;
6702 
6703         // Must take a mask argument if one is required
6704         if (MaskRequired && !Info.isMasked())
6705           continue;
6706 
6707         // Check that all parameter kinds are supported
6708         bool ParamsOk = true;
6709         for (VFParameter Param : Info.Shape.Parameters) {
6710           switch (Param.ParamKind) {
6711           case VFParamKind::Vector:
6712             break;
6713           case VFParamKind::OMP_Uniform: {
6714             Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
6715             // Make sure the scalar parameter in the loop is invariant.
6716             if (!PSE.getSE()->isLoopInvariant(PSE.getSCEV(ScalarParam),
6717                                               TheLoop))
6718               ParamsOk = false;
6719             break;
6720           }
6721           case VFParamKind::OMP_Linear: {
6722             Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
6723             // Find the stride for the scalar parameter in this loop and see if
6724             // it matches the stride for the variant.
6725             // TODO: do we need to figure out the cost of an extract to get the
6726             // first lane? Or do we hope that it will be folded away?
6727             ScalarEvolution *SE = PSE.getSE();
6728             const auto *SAR =
6729                 dyn_cast<SCEVAddRecExpr>(SE->getSCEV(ScalarParam));
6730 
6731             if (!SAR || SAR->getLoop() != TheLoop) {
6732               ParamsOk = false;
6733               break;
6734             }
6735 
6736             const SCEVConstant *Step =
6737                 dyn_cast<SCEVConstant>(SAR->getStepRecurrence(*SE));
6738 
6739             if (!Step ||
6740                 Step->getAPInt().getSExtValue() != Param.LinearStepOrPos)
6741               ParamsOk = false;
6742 
6743             break;
6744           }
6745           case VFParamKind::GlobalPredicate:
6746             UsesMask = true;
6747             break;
6748           default:
6749             ParamsOk = false;
6750             break;
6751           }
6752         }
6753 
6754         if (!ParamsOk)
6755           continue;
6756 
6757         // Found a suitable candidate, stop here.
6758         VecFunc = CI->getModule()->getFunction(Info.VectorName);
6759         FuncInfo = Info;
6760         break;
6761       }
6762 
6763       // Add in the cost of synthesizing a mask if one wasn't required.
6764       InstructionCost MaskCost = 0;
6765       if (VecFunc && UsesMask && !MaskRequired)
6766         MaskCost = TTI.getShuffleCost(
6767             TargetTransformInfo::SK_Broadcast,
6768             VectorType::get(IntegerType::getInt1Ty(
6769                                 VecFunc->getFunctionType()->getContext()),
6770                             VF));
6771 
6772       if (TLI && VecFunc && !CI->isNoBuiltin())
6773         VectorCost =
6774             TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind) + MaskCost;
6775 
6776       // Find the cost of an intrinsic; some targets may have instructions that
6777       // perform the operation without needing an actual call.
6778       Intrinsic::ID IID = getVectorIntrinsicIDForCall(CI, TLI);
6779       if (IID != Intrinsic::not_intrinsic)
6780         IntrinsicCost = getVectorIntrinsicCost(CI, VF);
6781 
6782       InstructionCost Cost = ScalarCost;
6783       InstWidening Decision = CM_Scalarize;
6784 
6785       if (VectorCost <= Cost) {
6786         Cost = VectorCost;
6787         Decision = CM_VectorCall;
6788       }
6789 
6790       if (IntrinsicCost <= Cost) {
6791         Cost = IntrinsicCost;
6792         Decision = CM_IntrinsicCall;
6793       }
6794 
6795       setCallWideningDecision(CI, VF, Decision, VecFunc, IID,
6796                               FuncInfo.getParamIndexForOptionalMask(), Cost);
6797     }
6798   }
6799 }
6800 
6801 InstructionCost
6802 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
6803                                                Type *&VectorTy) {
6804   Type *RetTy = I->getType();
6805   if (canTruncateToMinimalBitwidth(I, VF))
6806     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6807   auto SE = PSE.getSE();
6808   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6809 
6810   auto hasSingleCopyAfterVectorization = [this](Instruction *I,
6811                                                 ElementCount VF) -> bool {
6812     if (VF.isScalar())
6813       return true;
6814 
6815     auto Scalarized = InstsToScalarize.find(VF);
6816     assert(Scalarized != InstsToScalarize.end() &&
6817            "VF not yet analyzed for scalarization profitability");
6818     return !Scalarized->second.count(I) &&
6819            llvm::all_of(I->users(), [&](User *U) {
6820              auto *UI = cast<Instruction>(U);
6821              return !Scalarized->second.count(UI);
6822            });
6823   };
6824   (void) hasSingleCopyAfterVectorization;
6825 
6826   if (isScalarAfterVectorization(I, VF)) {
6827     // With the exception of GEPs and PHIs, after scalarization there should
6828     // only be one copy of the instruction generated in the loop. This is
6829     // because the VF is either 1, or any instructions that need scalarizing
6830     // have already been dealt with by the time we get here. As a result,
6831     // it means we don't have to multiply the instruction cost by VF.
6832     assert(I->getOpcode() == Instruction::GetElementPtr ||
6833            I->getOpcode() == Instruction::PHI ||
6834            (I->getOpcode() == Instruction::BitCast &&
6835             I->getType()->isPointerTy()) ||
6836            hasSingleCopyAfterVectorization(I, VF));
6837     VectorTy = RetTy;
6838   } else
6839     VectorTy = ToVectorTy(RetTy, VF);
6840 
6841   // TODO: We need to estimate the cost of intrinsic calls.
6842   switch (I->getOpcode()) {
6843   case Instruction::GetElementPtr:
6844     // We mark this instruction as zero-cost because the cost of GEPs in
6845     // vectorized code depends on whether the corresponding memory instruction
6846     // is scalarized or not. Therefore, we handle GEPs with the memory
6847     // instruction cost.
6848     return 0;
6849   case Instruction::Br: {
6850     // In cases of scalarized and predicated instructions, there will be VF
6851     // predicated blocks in the vectorized loop. Each branch around these
6852     // blocks requires also an extract of its vector compare i1 element.
6853     bool ScalarPredicatedBB = false;
6854     BranchInst *BI = cast<BranchInst>(I);
6855     if (VF.isVector() && BI->isConditional() &&
6856         (PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) ||
6857          PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))))
6858       ScalarPredicatedBB = true;
6859 
6860     if (ScalarPredicatedBB) {
6861       // Not possible to scalarize scalable vector with predicated instructions.
6862       if (VF.isScalable())
6863         return InstructionCost::getInvalid();
6864       // Return cost for branches around scalarized and predicated blocks.
6865       auto *Vec_i1Ty =
6866           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6867       return (
6868           TTI.getScalarizationOverhead(
6869               Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()),
6870               /*Insert*/ false, /*Extract*/ true, CostKind) +
6871           (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
6872     } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
6873       // The back-edge branch will remain, as will all scalar branches.
6874       return TTI.getCFInstrCost(Instruction::Br, CostKind);
6875     else
6876       // This branch will be eliminated by if-conversion.
6877       return 0;
6878     // Note: We currently assume zero cost for an unconditional branch inside
6879     // a predicated block since it will become a fall-through, although we
6880     // may decide in the future to call TTI for all branches.
6881   }
6882   case Instruction::PHI: {
6883     auto *Phi = cast<PHINode>(I);
6884 
6885     // First-order recurrences are replaced by vector shuffles inside the loop.
6886     if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) {
6887       SmallVector<int> Mask(VF.getKnownMinValue());
6888       std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
6889       return TTI.getShuffleCost(TargetTransformInfo::SK_Splice,
6890                                 cast<VectorType>(VectorTy), Mask, CostKind,
6891                                 VF.getKnownMinValue() - 1);
6892     }
6893 
6894     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6895     // converted into select instructions. We require N - 1 selects per phi
6896     // node, where N is the number of incoming values.
6897     if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
6898       return (Phi->getNumIncomingValues() - 1) *
6899              TTI.getCmpSelInstrCost(
6900                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
6901                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
6902                  CmpInst::BAD_ICMP_PREDICATE, CostKind);
6903 
6904     return TTI.getCFInstrCost(Instruction::PHI, CostKind);
6905   }
6906   case Instruction::UDiv:
6907   case Instruction::SDiv:
6908   case Instruction::URem:
6909   case Instruction::SRem:
6910     if (VF.isVector() && isPredicatedInst(I)) {
6911       const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
6912       return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ?
6913         ScalarCost : SafeDivisorCost;
6914     }
6915     // We've proven all lanes safe to speculate, fall through.
6916     [[fallthrough]];
6917   case Instruction::Add:
6918   case Instruction::FAdd:
6919   case Instruction::Sub:
6920   case Instruction::FSub:
6921   case Instruction::Mul:
6922   case Instruction::FMul:
6923   case Instruction::FDiv:
6924   case Instruction::FRem:
6925   case Instruction::Shl:
6926   case Instruction::LShr:
6927   case Instruction::AShr:
6928   case Instruction::And:
6929   case Instruction::Or:
6930   case Instruction::Xor: {
6931     // If we're speculating on the stride being 1, the multiplication may
6932     // fold away.  We can generalize this for all operations using the notion
6933     // of neutral elements.  (TODO)
6934     if (I->getOpcode() == Instruction::Mul &&
6935         (PSE.getSCEV(I->getOperand(0))->isOne() ||
6936          PSE.getSCEV(I->getOperand(1))->isOne()))
6937       return 0;
6938 
6939     // Detect reduction patterns
6940     if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
6941       return *RedCost;
6942 
6943     // Certain instructions can be cheaper to vectorize if they have a constant
6944     // second vector operand. One example of this are shifts on x86.
6945     Value *Op2 = I->getOperand(1);
6946     auto Op2Info = TTI.getOperandInfo(Op2);
6947     if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
6948         Legal->isInvariant(Op2))
6949       Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
6950 
6951     SmallVector<const Value *, 4> Operands(I->operand_values());
6952     auto InstrCost = TTI.getArithmeticInstrCost(
6953         I->getOpcode(), VectorTy, CostKind,
6954         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6955         Op2Info, Operands, I);
6956 
6957     // Some targets can replace frem with vector library calls.
6958     InstructionCost VecCallCost = InstructionCost::getInvalid();
6959     if (I->getOpcode() == Instruction::FRem) {
6960       LibFunc Func;
6961       if (TLI->getLibFunc(I->getOpcode(), I->getType(), Func) &&
6962           TLI->isFunctionVectorizable(TLI->getName(Func), VF)) {
6963         SmallVector<Type *, 4> OpTypes;
6964         for (auto &Op : I->operands())
6965           OpTypes.push_back(Op->getType());
6966         VecCallCost =
6967             TTI.getCallInstrCost(nullptr, VectorTy, OpTypes, CostKind);
6968       }
6969     }
6970     return std::min(InstrCost, VecCallCost);
6971   }
6972   case Instruction::FNeg: {
6973     return TTI.getArithmeticInstrCost(
6974         I->getOpcode(), VectorTy, CostKind,
6975         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6976         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6977         I->getOperand(0), I);
6978   }
6979   case Instruction::Select: {
6980     SelectInst *SI = cast<SelectInst>(I);
6981     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6982     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6983 
6984     const Value *Op0, *Op1;
6985     using namespace llvm::PatternMatch;
6986     if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
6987                         match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
6988       // select x, y, false --> x & y
6989       // select x, true, y --> x | y
6990       const auto [Op1VK, Op1VP] = TTI::getOperandInfo(Op0);
6991       const auto [Op2VK, Op2VP] = TTI::getOperandInfo(Op1);
6992       assert(Op0->getType()->getScalarSizeInBits() == 1 &&
6993               Op1->getType()->getScalarSizeInBits() == 1);
6994 
6995       SmallVector<const Value *, 2> Operands{Op0, Op1};
6996       return TTI.getArithmeticInstrCost(
6997           match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy,
6998           CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, I);
6999     }
7000 
7001     Type *CondTy = SI->getCondition()->getType();
7002     if (!ScalarCond)
7003       CondTy = VectorType::get(CondTy, VF);
7004 
7005     CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
7006     if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
7007       Pred = Cmp->getPredicate();
7008     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred,
7009                                   CostKind, I);
7010   }
7011   case Instruction::ICmp:
7012   case Instruction::FCmp: {
7013     Type *ValTy = I->getOperand(0)->getType();
7014     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
7015     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
7016       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
7017     VectorTy = ToVectorTy(ValTy, VF);
7018     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
7019                                   cast<CmpInst>(I)->getPredicate(), CostKind,
7020                                   I);
7021   }
7022   case Instruction::Store:
7023   case Instruction::Load: {
7024     ElementCount Width = VF;
7025     if (Width.isVector()) {
7026       InstWidening Decision = getWideningDecision(I, Width);
7027       assert(Decision != CM_Unknown &&
7028              "CM decision should be taken at this point");
7029       if (getWideningCost(I, VF) == InstructionCost::getInvalid())
7030         return InstructionCost::getInvalid();
7031       if (Decision == CM_Scalarize)
7032         Width = ElementCount::getFixed(1);
7033     }
7034     VectorTy = ToVectorTy(getLoadStoreType(I), Width);
7035     return getMemoryInstructionCost(I, VF);
7036   }
7037   case Instruction::BitCast:
7038     if (I->getType()->isPointerTy())
7039       return 0;
7040     [[fallthrough]];
7041   case Instruction::ZExt:
7042   case Instruction::SExt:
7043   case Instruction::FPToUI:
7044   case Instruction::FPToSI:
7045   case Instruction::FPExt:
7046   case Instruction::PtrToInt:
7047   case Instruction::IntToPtr:
7048   case Instruction::SIToFP:
7049   case Instruction::UIToFP:
7050   case Instruction::Trunc:
7051   case Instruction::FPTrunc: {
7052     // Computes the CastContextHint from a Load/Store instruction.
7053     auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
7054       assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7055              "Expected a load or a store!");
7056 
7057       if (VF.isScalar() || !TheLoop->contains(I))
7058         return TTI::CastContextHint::Normal;
7059 
7060       switch (getWideningDecision(I, VF)) {
7061       case LoopVectorizationCostModel::CM_GatherScatter:
7062         return TTI::CastContextHint::GatherScatter;
7063       case LoopVectorizationCostModel::CM_Interleave:
7064         return TTI::CastContextHint::Interleave;
7065       case LoopVectorizationCostModel::CM_Scalarize:
7066       case LoopVectorizationCostModel::CM_Widen:
7067         return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
7068                                         : TTI::CastContextHint::Normal;
7069       case LoopVectorizationCostModel::CM_Widen_Reverse:
7070         return TTI::CastContextHint::Reversed;
7071       case LoopVectorizationCostModel::CM_Unknown:
7072         llvm_unreachable("Instr did not go through cost modelling?");
7073       case LoopVectorizationCostModel::CM_VectorCall:
7074       case LoopVectorizationCostModel::CM_IntrinsicCall:
7075         llvm_unreachable_internal("Instr has invalid widening decision");
7076       }
7077 
7078       llvm_unreachable("Unhandled case!");
7079     };
7080 
7081     unsigned Opcode = I->getOpcode();
7082     TTI::CastContextHint CCH = TTI::CastContextHint::None;
7083     // For Trunc, the context is the only user, which must be a StoreInst.
7084     if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
7085       if (I->hasOneUse())
7086         if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
7087           CCH = ComputeCCH(Store);
7088     }
7089     // For Z/Sext, the context is the operand, which must be a LoadInst.
7090     else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
7091              Opcode == Instruction::FPExt) {
7092       if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
7093         CCH = ComputeCCH(Load);
7094     }
7095 
7096     // We optimize the truncation of induction variables having constant
7097     // integer steps. The cost of these truncations is the same as the scalar
7098     // operation.
7099     if (isOptimizableIVTruncate(I, VF)) {
7100       auto *Trunc = cast<TruncInst>(I);
7101       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
7102                                   Trunc->getSrcTy(), CCH, CostKind, Trunc);
7103     }
7104 
7105     // Detect reduction patterns
7106     if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7107       return *RedCost;
7108 
7109     Type *SrcScalarTy = I->getOperand(0)->getType();
7110     Type *SrcVecTy =
7111         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
7112     if (canTruncateToMinimalBitwidth(I, VF)) {
7113       // This cast is going to be shrunk. This may remove the cast or it might
7114       // turn it into slightly different cast. For example, if MinBW == 16,
7115       // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
7116       //
7117       // Calculate the modified src and dest types.
7118       Type *MinVecTy = VectorTy;
7119       if (Opcode == Instruction::Trunc) {
7120         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
7121         VectorTy =
7122             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7123       } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
7124         // Leave SrcVecTy unchanged - we only shrink the destination element
7125         // type.
7126         VectorTy =
7127             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7128       }
7129     }
7130 
7131     return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
7132   }
7133   case Instruction::Call:
7134     return getVectorCallCost(cast<CallInst>(I), VF);
7135   case Instruction::ExtractValue:
7136     return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
7137   case Instruction::Alloca:
7138     // We cannot easily widen alloca to a scalable alloca, as
7139     // the result would need to be a vector of pointers.
7140     if (VF.isScalable())
7141       return InstructionCost::getInvalid();
7142     [[fallthrough]];
7143   default:
7144     // This opcode is unknown. Assume that it is the same as 'mul'.
7145     return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
7146   } // end of switch.
7147 }
7148 
7149 void LoopVectorizationCostModel::collectValuesToIgnore() {
7150   // Ignore ephemeral values.
7151   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
7152 
7153   // Find all stores to invariant variables. Since they are going to sink
7154   // outside the loop we do not need calculate cost for them.
7155   for (BasicBlock *BB : TheLoop->blocks())
7156     for (Instruction &I : *BB) {
7157       StoreInst *SI;
7158       if ((SI = dyn_cast<StoreInst>(&I)) &&
7159           Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
7160         ValuesToIgnore.insert(&I);
7161     }
7162 
7163   // Ignore type-promoting instructions we identified during reduction
7164   // detection.
7165   for (const auto &Reduction : Legal->getReductionVars()) {
7166     const RecurrenceDescriptor &RedDes = Reduction.second;
7167     const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
7168     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7169   }
7170   // Ignore type-casting instructions we identified during induction
7171   // detection.
7172   for (const auto &Induction : Legal->getInductionVars()) {
7173     const InductionDescriptor &IndDes = Induction.second;
7174     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7175     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7176   }
7177 }
7178 
7179 void LoopVectorizationCostModel::collectInLoopReductions() {
7180   for (const auto &Reduction : Legal->getReductionVars()) {
7181     PHINode *Phi = Reduction.first;
7182     const RecurrenceDescriptor &RdxDesc = Reduction.second;
7183 
7184     // We don't collect reductions that are type promoted (yet).
7185     if (RdxDesc.getRecurrenceType() != Phi->getType())
7186       continue;
7187 
7188     // If the target would prefer this reduction to happen "in-loop", then we
7189     // want to record it as such.
7190     unsigned Opcode = RdxDesc.getOpcode();
7191     if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
7192         !TTI.preferInLoopReduction(Opcode, Phi->getType(),
7193                                    TargetTransformInfo::ReductionFlags()))
7194       continue;
7195 
7196     // Check that we can correctly put the reductions into the loop, by
7197     // finding the chain of operations that leads from the phi to the loop
7198     // exit value.
7199     SmallVector<Instruction *, 4> ReductionOperations =
7200         RdxDesc.getReductionOpChain(Phi, TheLoop);
7201     bool InLoop = !ReductionOperations.empty();
7202 
7203     if (InLoop) {
7204       InLoopReductions.insert(Phi);
7205       // Add the elements to InLoopReductionImmediateChains for cost modelling.
7206       Instruction *LastChain = Phi;
7207       for (auto *I : ReductionOperations) {
7208         InLoopReductionImmediateChains[I] = LastChain;
7209         LastChain = I;
7210       }
7211     }
7212     LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
7213                       << " reduction for phi: " << *Phi << "\n");
7214   }
7215 }
7216 
7217 VPValue *VPBuilder::createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B,
7218                                DebugLoc DL, const Twine &Name) {
7219   assert(Pred >= CmpInst::FIRST_ICMP_PREDICATE &&
7220          Pred <= CmpInst::LAST_ICMP_PREDICATE && "invalid predicate");
7221   return tryInsertInstruction(
7222       new VPInstruction(Instruction::ICmp, Pred, A, B, DL, Name));
7223 }
7224 
7225 // This function will select a scalable VF if the target supports scalable
7226 // vectors and a fixed one otherwise.
7227 // TODO: we could return a pair of values that specify the max VF and
7228 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
7229 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
7230 // doesn't have a cost model that can choose which plan to execute if
7231 // more than one is generated.
7232 static ElementCount determineVPlanVF(const TargetTransformInfo &TTI,
7233                                      LoopVectorizationCostModel &CM) {
7234   unsigned WidestType;
7235   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
7236 
7237   TargetTransformInfo::RegisterKind RegKind =
7238       TTI.enableScalableVectorization()
7239           ? TargetTransformInfo::RGK_ScalableVector
7240           : TargetTransformInfo::RGK_FixedWidthVector;
7241 
7242   TypeSize RegSize = TTI.getRegisterBitWidth(RegKind);
7243   unsigned N = RegSize.getKnownMinValue() / WidestType;
7244   return ElementCount::get(N, RegSize.isScalable());
7245 }
7246 
7247 VectorizationFactor
7248 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
7249   ElementCount VF = UserVF;
7250   // Outer loop handling: They may require CFG and instruction level
7251   // transformations before even evaluating whether vectorization is profitable.
7252   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7253   // the vectorization pipeline.
7254   if (!OrigLoop->isInnermost()) {
7255     // If the user doesn't provide a vectorization factor, determine a
7256     // reasonable one.
7257     if (UserVF.isZero()) {
7258       VF = determineVPlanVF(TTI, CM);
7259       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
7260 
7261       // Make sure we have a VF > 1 for stress testing.
7262       if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
7263         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
7264                           << "overriding computed VF.\n");
7265         VF = ElementCount::getFixed(4);
7266       }
7267     } else if (UserVF.isScalable() && !TTI.supportsScalableVectors() &&
7268                !ForceTargetSupportsScalableVectors) {
7269       LLVM_DEBUG(dbgs() << "LV: Not vectorizing. Scalable VF requested, but "
7270                         << "not supported by the target.\n");
7271       reportVectorizationFailure(
7272           "Scalable vectorization requested but not supported by the target",
7273           "the scalable user-specified vectorization width for outer-loop "
7274           "vectorization cannot be used because the target does not support "
7275           "scalable vectors.",
7276           "ScalableVFUnfeasible", ORE, OrigLoop);
7277       return VectorizationFactor::Disabled();
7278     }
7279     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7280     assert(isPowerOf2_32(VF.getKnownMinValue()) &&
7281            "VF needs to be a power of two");
7282     LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
7283                       << "VF " << VF << " to build VPlans.\n");
7284     buildVPlans(VF, VF);
7285 
7286     // For VPlan build stress testing, we bail out after VPlan construction.
7287     if (VPlanBuildStressTest)
7288       return VectorizationFactor::Disabled();
7289 
7290     return {VF, 0 /*Cost*/, 0 /* ScalarCost */};
7291   }
7292 
7293   LLVM_DEBUG(
7294       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
7295                 "VPlan-native path.\n");
7296   return VectorizationFactor::Disabled();
7297 }
7298 
7299 std::optional<VectorizationFactor>
7300 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
7301   assert(OrigLoop->isInnermost() && "Inner loop expected.");
7302   CM.collectValuesToIgnore();
7303   CM.collectElementTypesForWidening();
7304 
7305   FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
7306   if (!MaxFactors) // Cases that should not to be vectorized nor interleaved.
7307     return std::nullopt;
7308 
7309   // Invalidate interleave groups if all blocks of loop will be predicated.
7310   if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
7311       !useMaskedInterleavedAccesses(TTI)) {
7312     LLVM_DEBUG(
7313         dbgs()
7314         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
7315            "which requires masked-interleaved support.\n");
7316     if (CM.InterleaveInfo.invalidateGroups())
7317       // Invalidating interleave groups also requires invalidating all decisions
7318       // based on them, which includes widening decisions and uniform and scalar
7319       // values.
7320       CM.invalidateCostModelingDecisions();
7321   }
7322 
7323   ElementCount MaxUserVF =
7324       UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
7325   bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF);
7326   if (!UserVF.isZero() && UserVFIsLegal) {
7327     assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
7328            "VF needs to be a power of two");
7329     // Collect the instructions (and their associated costs) that will be more
7330     // profitable to scalarize.
7331     CM.collectInLoopReductions();
7332     if (CM.selectUserVectorizationFactor(UserVF)) {
7333       LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
7334       buildVPlansWithVPRecipes(UserVF, UserVF);
7335       if (!hasPlanWithVF(UserVF)) {
7336         LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << UserVF
7337                           << ".\n");
7338         return std::nullopt;
7339       }
7340 
7341       LLVM_DEBUG(printPlans(dbgs()));
7342       return {{UserVF, 0, 0}};
7343     } else
7344       reportVectorizationInfo("UserVF ignored because of invalid costs.",
7345                               "InvalidCost", ORE, OrigLoop);
7346   }
7347 
7348   // Populate the set of Vectorization Factor Candidates.
7349   ElementCountSet VFCandidates;
7350   for (auto VF = ElementCount::getFixed(1);
7351        ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
7352     VFCandidates.insert(VF);
7353   for (auto VF = ElementCount::getScalable(1);
7354        ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
7355     VFCandidates.insert(VF);
7356 
7357   CM.collectInLoopReductions();
7358   for (const auto &VF : VFCandidates) {
7359     // Collect Uniform and Scalar instructions after vectorization with VF.
7360     CM.collectUniformsAndScalars(VF);
7361 
7362     // Collect the instructions (and their associated costs) that will be more
7363     // profitable to scalarize.
7364     if (VF.isVector())
7365       CM.collectInstsToScalarize(VF);
7366   }
7367 
7368   buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
7369   buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
7370 
7371   LLVM_DEBUG(printPlans(dbgs()));
7372   if (!MaxFactors.hasVector())
7373     return VectorizationFactor::Disabled();
7374 
7375   // Select the optimal vectorization factor.
7376   VectorizationFactor VF = selectVectorizationFactor(VFCandidates);
7377   assert((VF.Width.isScalar() || VF.ScalarCost > 0) && "when vectorizing, the scalar cost must be non-zero.");
7378   if (!hasPlanWithVF(VF.Width)) {
7379     LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << VF.Width
7380                       << ".\n");
7381     return std::nullopt;
7382   }
7383   return VF;
7384 }
7385 
7386 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
7387   assert(count_if(VPlans,
7388                   [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
7389              1 &&
7390          "Best VF has not a single VPlan.");
7391 
7392   for (const VPlanPtr &Plan : VPlans) {
7393     if (Plan->hasVF(VF))
7394       return *Plan.get();
7395   }
7396   llvm_unreachable("No plan found!");
7397 }
7398 
7399 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
7400   SmallVector<Metadata *, 4> MDs;
7401   // Reserve first location for self reference to the LoopID metadata node.
7402   MDs.push_back(nullptr);
7403   bool IsUnrollMetadata = false;
7404   MDNode *LoopID = L->getLoopID();
7405   if (LoopID) {
7406     // First find existing loop unrolling disable metadata.
7407     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
7408       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
7409       if (MD) {
7410         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
7411         IsUnrollMetadata =
7412             S && S->getString().starts_with("llvm.loop.unroll.disable");
7413       }
7414       MDs.push_back(LoopID->getOperand(i));
7415     }
7416   }
7417 
7418   if (!IsUnrollMetadata) {
7419     // Add runtime unroll disable metadata.
7420     LLVMContext &Context = L->getHeader()->getContext();
7421     SmallVector<Metadata *, 1> DisableOperands;
7422     DisableOperands.push_back(
7423         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7424     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7425     MDs.push_back(DisableNode);
7426     MDNode *NewLoopID = MDNode::get(Context, MDs);
7427     // Set operand 0 to refer to the loop id itself.
7428     NewLoopID->replaceOperandWith(0, NewLoopID);
7429     L->setLoopID(NewLoopID);
7430   }
7431 }
7432 
7433 // Check if \p RedResult is a ComputeReductionResult instruction, and if it is
7434 // create a merge phi node for it and add it to \p ReductionResumeValues.
7435 static void createAndCollectMergePhiForReduction(
7436     VPInstruction *RedResult,
7437     DenseMap<const RecurrenceDescriptor *, Value *> &ReductionResumeValues,
7438     VPTransformState &State, Loop *OrigLoop, BasicBlock *LoopMiddleBlock) {
7439   if (!RedResult ||
7440       RedResult->getOpcode() != VPInstruction::ComputeReductionResult)
7441     return;
7442 
7443   auto *PhiR = cast<VPReductionPHIRecipe>(RedResult->getOperand(0));
7444   const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
7445 
7446   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
7447   Value *FinalValue =
7448       State.get(RedResult, VPIteration(State.UF - 1, VPLane::getFirstLane()));
7449   auto *ResumePhi =
7450       dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
7451 
7452   // TODO: bc.merge.rdx should not be created here, instead it should be
7453   // modeled in VPlan.
7454   BasicBlock *LoopScalarPreHeader = OrigLoop->getLoopPreheader();
7455   // Create a phi node that merges control-flow from the backedge-taken check
7456   // block and the middle block.
7457   auto *BCBlockPhi = PHINode::Create(FinalValue->getType(), 2, "bc.merge.rdx",
7458                                      LoopScalarPreHeader->getTerminator());
7459 
7460   // If we are fixing reductions in the epilogue loop then we should already
7461   // have created a bc.merge.rdx Phi after the main vector body. Ensure that
7462   // we carry over the incoming values correctly.
7463   for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
7464     if (Incoming == LoopMiddleBlock)
7465       BCBlockPhi->addIncoming(FinalValue, Incoming);
7466     else if (ResumePhi && is_contained(ResumePhi->blocks(), Incoming))
7467       BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
7468                               Incoming);
7469     else
7470       BCBlockPhi->addIncoming(ReductionStartValue, Incoming);
7471   }
7472 
7473   auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
7474   // TODO: This fixup should instead be modeled in VPlan.
7475   // Fix the scalar loop reduction variable with the incoming reduction sum
7476   // from the vector body and from the backedge value.
7477   int IncomingEdgeBlockIdx =
7478       OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
7479   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
7480   // Pick the other block.
7481   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
7482   OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
7483   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
7484   OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
7485 
7486   ReductionResumeValues[&RdxDesc] = BCBlockPhi;
7487 }
7488 
7489 std::pair<DenseMap<const SCEV *, Value *>,
7490           DenseMap<const RecurrenceDescriptor *, Value *>>
7491 LoopVectorizationPlanner::executePlan(
7492     ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan,
7493     InnerLoopVectorizer &ILV, DominatorTree *DT, bool IsEpilogueVectorization,
7494     const DenseMap<const SCEV *, Value *> *ExpandedSCEVs) {
7495   assert(BestVPlan.hasVF(BestVF) &&
7496          "Trying to execute plan with unsupported VF");
7497   assert(BestVPlan.hasUF(BestUF) &&
7498          "Trying to execute plan with unsupported UF");
7499   assert(
7500       (IsEpilogueVectorization || !ExpandedSCEVs) &&
7501       "expanded SCEVs to reuse can only be used during epilogue vectorization");
7502 
7503   LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF
7504                     << '\n');
7505 
7506   if (!IsEpilogueVectorization)
7507     VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
7508 
7509   // Perform the actual loop transformation.
7510   VPTransformState State(BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan,
7511                          OrigLoop->getHeader()->getContext());
7512 
7513   // 0. Generate SCEV-dependent code into the preheader, including TripCount,
7514   // before making any changes to the CFG.
7515   if (!BestVPlan.getPreheader()->empty()) {
7516     State.CFG.PrevBB = OrigLoop->getLoopPreheader();
7517     State.Builder.SetInsertPoint(OrigLoop->getLoopPreheader()->getTerminator());
7518     BestVPlan.getPreheader()->execute(&State);
7519   }
7520   if (!ILV.getTripCount())
7521     ILV.setTripCount(State.get(BestVPlan.getTripCount(), {0, 0}));
7522   else
7523     assert(IsEpilogueVectorization && "should only re-use the existing trip "
7524                                       "count during epilogue vectorization");
7525 
7526   // 1. Set up the skeleton for vectorization, including vector pre-header and
7527   // middle block. The vector loop is created during VPlan execution.
7528   Value *CanonicalIVStartValue;
7529   std::tie(State.CFG.PrevBB, CanonicalIVStartValue) =
7530       ILV.createVectorizedLoopSkeleton(ExpandedSCEVs ? *ExpandedSCEVs
7531                                                      : State.ExpandedSCEVs);
7532 
7533   // Only use noalias metadata when using memory checks guaranteeing no overlap
7534   // across all iterations.
7535   const LoopAccessInfo *LAI = ILV.Legal->getLAI();
7536   std::unique_ptr<LoopVersioning> LVer = nullptr;
7537   if (LAI && !LAI->getRuntimePointerChecking()->getChecks().empty() &&
7538       !LAI->getRuntimePointerChecking()->getDiffChecks()) {
7539 
7540     //  We currently don't use LoopVersioning for the actual loop cloning but we
7541     //  still use it to add the noalias metadata.
7542     //  TODO: Find a better way to re-use LoopVersioning functionality to add
7543     //        metadata.
7544     LVer = std::make_unique<LoopVersioning>(
7545         *LAI, LAI->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, DT,
7546         PSE.getSE());
7547     State.LVer = &*LVer;
7548     State.LVer->prepareNoAliasMetadata();
7549   }
7550 
7551   ILV.collectPoisonGeneratingRecipes(State);
7552 
7553   ILV.printDebugTracesAtStart();
7554 
7555   //===------------------------------------------------===//
7556   //
7557   // Notice: any optimization or new instruction that go
7558   // into the code below should also be implemented in
7559   // the cost-model.
7560   //
7561   //===------------------------------------------------===//
7562 
7563   // 2. Copy and widen instructions from the old loop into the new loop.
7564   BestVPlan.prepareToExecute(ILV.getTripCount(),
7565                              ILV.getOrCreateVectorTripCount(nullptr),
7566                              CanonicalIVStartValue, State);
7567 
7568   BestVPlan.execute(&State);
7569 
7570   // 2.5 Collect reduction resume values.
7571   DenseMap<const RecurrenceDescriptor *, Value *> ReductionResumeValues;
7572   auto *ExitVPBB =
7573       cast<VPBasicBlock>(BestVPlan.getVectorLoopRegion()->getSingleSuccessor());
7574   for (VPRecipeBase &R : *ExitVPBB) {
7575     createAndCollectMergePhiForReduction(dyn_cast<VPInstruction>(&R),
7576                                          ReductionResumeValues, State, OrigLoop,
7577                                          State.CFG.VPBB2IRBB[ExitVPBB]);
7578   }
7579 
7580   // 2.6. Maintain Loop Hints
7581   // Keep all loop hints from the original loop on the vector loop (we'll
7582   // replace the vectorizer-specific hints below).
7583   MDNode *OrigLoopID = OrigLoop->getLoopID();
7584 
7585   std::optional<MDNode *> VectorizedLoopID =
7586       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7587                                       LLVMLoopVectorizeFollowupVectorized});
7588 
7589   VPBasicBlock *HeaderVPBB =
7590       BestVPlan.getVectorLoopRegion()->getEntryBasicBlock();
7591   Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]);
7592   if (VectorizedLoopID)
7593     L->setLoopID(*VectorizedLoopID);
7594   else {
7595     // Keep all loop hints from the original loop on the vector loop (we'll
7596     // replace the vectorizer-specific hints below).
7597     if (MDNode *LID = OrigLoop->getLoopID())
7598       L->setLoopID(LID);
7599 
7600     LoopVectorizeHints Hints(L, true, *ORE);
7601     Hints.setAlreadyVectorized();
7602   }
7603   TargetTransformInfo::UnrollingPreferences UP;
7604   TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE);
7605   if (!UP.UnrollVectorizedLoop || CanonicalIVStartValue)
7606     AddRuntimeUnrollDisableMetaData(L);
7607 
7608   // 3. Fix the vectorized code: take care of header phi's, live-outs,
7609   //    predication, updating analyses.
7610   ILV.fixVectorizedLoop(State, BestVPlan);
7611 
7612   ILV.printDebugTracesAtEnd();
7613 
7614   return {State.ExpandedSCEVs, ReductionResumeValues};
7615 }
7616 
7617 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
7618 void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
7619   for (const auto &Plan : VPlans)
7620     if (PrintVPlansInDotFormat)
7621       Plan->printDOT(O);
7622     else
7623       Plan->print(O);
7624 }
7625 #endif
7626 
7627 //===--------------------------------------------------------------------===//
7628 // EpilogueVectorizerMainLoop
7629 //===--------------------------------------------------------------------===//
7630 
7631 /// This function is partially responsible for generating the control flow
7632 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7633 std::pair<BasicBlock *, Value *>
7634 EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton(
7635     const SCEV2ValueTy &ExpandedSCEVs) {
7636   createVectorLoopSkeleton("");
7637 
7638   // Generate the code to check the minimum iteration count of the vector
7639   // epilogue (see below).
7640   EPI.EpilogueIterationCountCheck =
7641       emitIterationCountCheck(LoopScalarPreHeader, true);
7642   EPI.EpilogueIterationCountCheck->setName("iter.check");
7643 
7644   // Generate the code to check any assumptions that we've made for SCEV
7645   // expressions.
7646   EPI.SCEVSafetyCheck = emitSCEVChecks(LoopScalarPreHeader);
7647 
7648   // Generate the code that checks at runtime if arrays overlap. We put the
7649   // checks into a separate block to make the more common case of few elements
7650   // faster.
7651   EPI.MemSafetyCheck = emitMemRuntimeChecks(LoopScalarPreHeader);
7652 
7653   // Generate the iteration count check for the main loop, *after* the check
7654   // for the epilogue loop, so that the path-length is shorter for the case
7655   // that goes directly through the vector epilogue. The longer-path length for
7656   // the main loop is compensated for, by the gain from vectorizing the larger
7657   // trip count. Note: the branch will get updated later on when we vectorize
7658   // the epilogue.
7659   EPI.MainLoopIterationCountCheck =
7660       emitIterationCountCheck(LoopScalarPreHeader, false);
7661 
7662   // Generate the induction variable.
7663   EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
7664 
7665   // Skip induction resume value creation here because they will be created in
7666   // the second pass for the scalar loop. The induction resume values for the
7667   // inductions in the epilogue loop are created before executing the plan for
7668   // the epilogue loop.
7669 
7670   return {completeLoopSkeleton(), nullptr};
7671 }
7672 
7673 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
7674   LLVM_DEBUG({
7675     dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7676            << "Main Loop VF:" << EPI.MainLoopVF
7677            << ", Main Loop UF:" << EPI.MainLoopUF
7678            << ", Epilogue Loop VF:" << EPI.EpilogueVF
7679            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7680   });
7681 }
7682 
7683 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
7684   DEBUG_WITH_TYPE(VerboseDebug, {
7685     dbgs() << "intermediate fn:\n"
7686            << *OrigLoop->getHeader()->getParent() << "\n";
7687   });
7688 }
7689 
7690 BasicBlock *
7691 EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
7692                                                     bool ForEpilogue) {
7693   assert(Bypass && "Expected valid bypass basic block.");
7694   ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF;
7695   unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
7696   Value *Count = getTripCount();
7697   // Reuse existing vector loop preheader for TC checks.
7698   // Note that new preheader block is generated for vector loop.
7699   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
7700   IRBuilder<> Builder(TCCheckBlock->getTerminator());
7701 
7702   // Generate code to check if the loop's trip count is less than VF * UF of the
7703   // main vector loop.
7704   auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF.isVector()
7705                                                     : VF.isVector())
7706                ? ICmpInst::ICMP_ULE
7707                : ICmpInst::ICMP_ULT;
7708 
7709   Value *CheckMinIters = Builder.CreateICmp(
7710       P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor),
7711       "min.iters.check");
7712 
7713   if (!ForEpilogue)
7714     TCCheckBlock->setName("vector.main.loop.iter.check");
7715 
7716   // Create new preheader for vector loop.
7717   LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
7718                                    DT, LI, nullptr, "vector.ph");
7719 
7720   if (ForEpilogue) {
7721     assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
7722                                  DT->getNode(Bypass)->getIDom()) &&
7723            "TC check is expected to dominate Bypass");
7724 
7725     // Update dominator for Bypass & LoopExit.
7726     DT->changeImmediateDominator(Bypass, TCCheckBlock);
7727     if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector()))
7728       // For loops with multiple exits, there's no edge from the middle block
7729       // to exit blocks (as the epilogue must run) and thus no need to update
7730       // the immediate dominator of the exit blocks.
7731       DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
7732 
7733     LoopBypassBlocks.push_back(TCCheckBlock);
7734 
7735     // Save the trip count so we don't have to regenerate it in the
7736     // vec.epilog.iter.check. This is safe to do because the trip count
7737     // generated here dominates the vector epilog iter check.
7738     EPI.TripCount = Count;
7739   }
7740 
7741   BranchInst &BI =
7742       *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
7743   if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
7744     setBranchWeights(BI, MinItersBypassWeights);
7745   ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
7746 
7747   return TCCheckBlock;
7748 }
7749 
7750 //===--------------------------------------------------------------------===//
7751 // EpilogueVectorizerEpilogueLoop
7752 //===--------------------------------------------------------------------===//
7753 
7754 /// This function is partially responsible for generating the control flow
7755 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7756 std::pair<BasicBlock *, Value *>
7757 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
7758     const SCEV2ValueTy &ExpandedSCEVs) {
7759   createVectorLoopSkeleton("vec.epilog.");
7760 
7761   // Now, compare the remaining count and if there aren't enough iterations to
7762   // execute the vectorized epilogue skip to the scalar part.
7763   BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader;
7764   VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check");
7765   LoopVectorPreHeader =
7766       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
7767                  LI, nullptr, "vec.epilog.ph");
7768   emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader,
7769                                           VecEpilogueIterationCountCheck);
7770 
7771   // Adjust the control flow taking the state info from the main loop
7772   // vectorization into account.
7773   assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
7774          "expected this to be saved from the previous pass.");
7775   EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
7776       VecEpilogueIterationCountCheck, LoopVectorPreHeader);
7777 
7778   DT->changeImmediateDominator(LoopVectorPreHeader,
7779                                EPI.MainLoopIterationCountCheck);
7780 
7781   EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
7782       VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7783 
7784   if (EPI.SCEVSafetyCheck)
7785     EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
7786         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7787   if (EPI.MemSafetyCheck)
7788     EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
7789         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7790 
7791   DT->changeImmediateDominator(
7792       VecEpilogueIterationCountCheck,
7793       VecEpilogueIterationCountCheck->getSinglePredecessor());
7794 
7795   DT->changeImmediateDominator(LoopScalarPreHeader,
7796                                EPI.EpilogueIterationCountCheck);
7797   if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector()))
7798     // If there is an epilogue which must run, there's no edge from the
7799     // middle block to exit blocks  and thus no need to update the immediate
7800     // dominator of the exit blocks.
7801     DT->changeImmediateDominator(LoopExitBlock,
7802                                  EPI.EpilogueIterationCountCheck);
7803 
7804   // Keep track of bypass blocks, as they feed start values to the induction and
7805   // reduction phis in the scalar loop preheader.
7806   if (EPI.SCEVSafetyCheck)
7807     LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
7808   if (EPI.MemSafetyCheck)
7809     LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
7810   LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
7811 
7812   // The vec.epilog.iter.check block may contain Phi nodes from inductions or
7813   // reductions which merge control-flow from the latch block and the middle
7814   // block. Update the incoming values here and move the Phi into the preheader.
7815   SmallVector<PHINode *, 4> PhisInBlock;
7816   for (PHINode &Phi : VecEpilogueIterationCountCheck->phis())
7817     PhisInBlock.push_back(&Phi);
7818 
7819   for (PHINode *Phi : PhisInBlock) {
7820     Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI());
7821     Phi->replaceIncomingBlockWith(
7822         VecEpilogueIterationCountCheck->getSinglePredecessor(),
7823         VecEpilogueIterationCountCheck);
7824 
7825     // If the phi doesn't have an incoming value from the
7826     // EpilogueIterationCountCheck, we are done. Otherwise remove the incoming
7827     // value and also those from other check blocks. This is needed for
7828     // reduction phis only.
7829     if (none_of(Phi->blocks(), [&](BasicBlock *IncB) {
7830           return EPI.EpilogueIterationCountCheck == IncB;
7831         }))
7832       continue;
7833     Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
7834     if (EPI.SCEVSafetyCheck)
7835       Phi->removeIncomingValue(EPI.SCEVSafetyCheck);
7836     if (EPI.MemSafetyCheck)
7837       Phi->removeIncomingValue(EPI.MemSafetyCheck);
7838   }
7839 
7840   // Generate a resume induction for the vector epilogue and put it in the
7841   // vector epilogue preheader
7842   Type *IdxTy = Legal->getWidestInductionType();
7843   PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val");
7844   EPResumeVal->insertBefore(LoopVectorPreHeader->getFirstNonPHIIt());
7845   EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
7846   EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
7847                            EPI.MainLoopIterationCountCheck);
7848 
7849   // Generate induction resume values. These variables save the new starting
7850   // indexes for the scalar loop. They are used to test if there are any tail
7851   // iterations left once the vector loop has completed.
7852   // Note that when the vectorized epilogue is skipped due to iteration count
7853   // check, then the resume value for the induction variable comes from
7854   // the trip count of the main vector loop, hence passing the AdditionalBypass
7855   // argument.
7856   createInductionResumeValues(ExpandedSCEVs,
7857                               {VecEpilogueIterationCountCheck,
7858                                EPI.VectorTripCount} /* AdditionalBypass */);
7859 
7860   return {completeLoopSkeleton(), EPResumeVal};
7861 }
7862 
7863 BasicBlock *
7864 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
7865     BasicBlock *Bypass, BasicBlock *Insert) {
7866 
7867   assert(EPI.TripCount &&
7868          "Expected trip count to have been safed in the first pass.");
7869   assert(
7870       (!isa<Instruction>(EPI.TripCount) ||
7871        DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
7872       "saved trip count does not dominate insertion point.");
7873   Value *TC = EPI.TripCount;
7874   IRBuilder<> Builder(Insert->getTerminator());
7875   Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
7876 
7877   // Generate code to check if the loop's trip count is less than VF * UF of the
7878   // vector epilogue loop.
7879   auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector())
7880                ? ICmpInst::ICMP_ULE
7881                : ICmpInst::ICMP_ULT;
7882 
7883   Value *CheckMinIters =
7884       Builder.CreateICmp(P, Count,
7885                          createStepForVF(Builder, Count->getType(),
7886                                          EPI.EpilogueVF, EPI.EpilogueUF),
7887                          "min.epilog.iters.check");
7888 
7889   BranchInst &BI =
7890       *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
7891   if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
7892     unsigned MainLoopStep = UF * VF.getKnownMinValue();
7893     unsigned EpilogueLoopStep =
7894         EPI.EpilogueUF * EPI.EpilogueVF.getKnownMinValue();
7895     // We assume the remaining `Count` is equally distributed in
7896     // [0, MainLoopStep)
7897     // So the probability for `Count < EpilogueLoopStep` should be
7898     // min(MainLoopStep, EpilogueLoopStep) / MainLoopStep
7899     unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep);
7900     const uint32_t Weights[] = {EstimatedSkipCount,
7901                                 MainLoopStep - EstimatedSkipCount};
7902     setBranchWeights(BI, Weights);
7903   }
7904   ReplaceInstWithInst(Insert->getTerminator(), &BI);
7905 
7906   LoopBypassBlocks.push_back(Insert);
7907   return Insert;
7908 }
7909 
7910 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
7911   LLVM_DEBUG({
7912     dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
7913            << "Epilogue Loop VF:" << EPI.EpilogueVF
7914            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7915   });
7916 }
7917 
7918 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
7919   DEBUG_WITH_TYPE(VerboseDebug, {
7920     dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
7921   });
7922 }
7923 
7924 bool LoopVectorizationPlanner::getDecisionAndClampRange(
7925     const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
7926   assert(!Range.isEmpty() && "Trying to test an empty VF range.");
7927   bool PredicateAtRangeStart = Predicate(Range.Start);
7928 
7929   for (ElementCount TmpVF : VFRange(Range.Start * 2, Range.End))
7930     if (Predicate(TmpVF) != PredicateAtRangeStart) {
7931       Range.End = TmpVF;
7932       break;
7933     }
7934 
7935   return PredicateAtRangeStart;
7936 }
7937 
7938 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
7939 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
7940 /// of VF's starting at a given VF and extending it as much as possible. Each
7941 /// vectorization decision can potentially shorten this sub-range during
7942 /// buildVPlan().
7943 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
7944                                            ElementCount MaxVF) {
7945   auto MaxVFTimes2 = MaxVF * 2;
7946   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
7947     VFRange SubRange = {VF, MaxVFTimes2};
7948     VPlans.push_back(buildVPlan(SubRange));
7949     VF = SubRange.End;
7950   }
7951 }
7952 
7953 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
7954                                          VPlan &Plan) {
7955   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
7956 
7957   // Look for cached value.
7958   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
7959   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
7960   if (ECEntryIt != EdgeMaskCache.end())
7961     return ECEntryIt->second;
7962 
7963   VPValue *SrcMask = getBlockInMask(Src);
7964 
7965   // The terminator has to be a branch inst!
7966   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
7967   assert(BI && "Unexpected terminator found");
7968 
7969   if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
7970     return EdgeMaskCache[Edge] = SrcMask;
7971 
7972   // If source is an exiting block, we know the exit edge is dynamically dead
7973   // in the vector loop, and thus we don't need to restrict the mask.  Avoid
7974   // adding uses of an otherwise potentially dead instruction.
7975   if (OrigLoop->isLoopExiting(Src))
7976     return EdgeMaskCache[Edge] = SrcMask;
7977 
7978   VPValue *EdgeMask = Plan.getVPValueOrAddLiveIn(BI->getCondition());
7979   assert(EdgeMask && "No Edge Mask found for condition");
7980 
7981   if (BI->getSuccessor(0) != Dst)
7982     EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc());
7983 
7984   if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
7985     // The condition is 'SrcMask && EdgeMask', which is equivalent to
7986     // 'select i1 SrcMask, i1 EdgeMask, i1 false'.
7987     // The select version does not introduce new UB if SrcMask is false and
7988     // EdgeMask is poison. Using 'and' here introduces undefined behavior.
7989     VPValue *False = Plan.getVPValueOrAddLiveIn(
7990         ConstantInt::getFalse(BI->getCondition()->getType()));
7991     EdgeMask =
7992         Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc());
7993   }
7994 
7995   return EdgeMaskCache[Edge] = EdgeMask;
7996 }
7997 
7998 void VPRecipeBuilder::createHeaderMask(VPlan &Plan) {
7999   BasicBlock *Header = OrigLoop->getHeader();
8000 
8001   // When not folding the tail, use nullptr to model all-true mask.
8002   if (!CM.foldTailByMasking()) {
8003     BlockMaskCache[Header] = nullptr;
8004     return;
8005   }
8006 
8007   // Introduce the early-exit compare IV <= BTC to form header block mask.
8008   // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
8009   // constructing the desired canonical IV in the header block as its first
8010   // non-phi instructions.
8011 
8012   VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
8013   auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
8014   auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV());
8015   HeaderVPBB->insert(IV, NewInsertionPoint);
8016 
8017   VPBuilder::InsertPointGuard Guard(Builder);
8018   Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
8019   VPValue *BlockMask = nullptr;
8020   VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
8021   BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC);
8022   BlockMaskCache[Header] = BlockMask;
8023 }
8024 
8025 VPValue *VPRecipeBuilder::getBlockInMask(BasicBlock *BB) const {
8026   // Return the cached value.
8027   BlockMaskCacheTy::const_iterator BCEntryIt = BlockMaskCache.find(BB);
8028   assert(BCEntryIt != BlockMaskCache.end() &&
8029          "Trying to access mask for block without one.");
8030   return BCEntryIt->second;
8031 }
8032 
8033 void VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) {
8034   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
8035   assert(BlockMaskCache.count(BB) == 0 && "Mask for block already computed");
8036   assert(OrigLoop->getHeader() != BB &&
8037          "Loop header must have cached block mask");
8038 
8039   // All-one mask is modelled as no-mask following the convention for masked
8040   // load/store/gather/scatter. Initialize BlockMask to no-mask.
8041   VPValue *BlockMask = nullptr;
8042   // This is the block mask. We OR all incoming edges.
8043   for (auto *Predecessor : predecessors(BB)) {
8044     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
8045     if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is too.
8046       BlockMaskCache[BB] = EdgeMask;
8047       return;
8048     }
8049 
8050     if (!BlockMask) { // BlockMask has its initialized nullptr value.
8051       BlockMask = EdgeMask;
8052       continue;
8053     }
8054 
8055     BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
8056   }
8057 
8058   BlockMaskCache[BB] = BlockMask;
8059 }
8060 
8061 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
8062                                                 ArrayRef<VPValue *> Operands,
8063                                                 VFRange &Range,
8064                                                 VPlanPtr &Plan) {
8065   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
8066          "Must be called with either a load or store");
8067 
8068   auto willWiden = [&](ElementCount VF) -> bool {
8069     LoopVectorizationCostModel::InstWidening Decision =
8070         CM.getWideningDecision(I, VF);
8071     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
8072            "CM decision should be taken at this point.");
8073     if (Decision == LoopVectorizationCostModel::CM_Interleave)
8074       return true;
8075     if (CM.isScalarAfterVectorization(I, VF) ||
8076         CM.isProfitableToScalarize(I, VF))
8077       return false;
8078     return Decision != LoopVectorizationCostModel::CM_Scalarize;
8079   };
8080 
8081   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8082     return nullptr;
8083 
8084   VPValue *Mask = nullptr;
8085   if (Legal->isMaskRequired(I))
8086     Mask = getBlockInMask(I->getParent());
8087 
8088   // Determine if the pointer operand of the access is either consecutive or
8089   // reverse consecutive.
8090   LoopVectorizationCostModel::InstWidening Decision =
8091       CM.getWideningDecision(I, Range.Start);
8092   bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
8093   bool Consecutive =
8094       Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
8095 
8096   VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
8097   if (Consecutive) {
8098     auto *GEP = dyn_cast<GetElementPtrInst>(
8099         Ptr->getUnderlyingValue()->stripPointerCasts());
8100     auto *VectorPtr = new VPVectorPointerRecipe(
8101         Ptr, getLoadStoreType(I), Reverse, GEP ? GEP->isInBounds() : false,
8102         I->getDebugLoc());
8103     Builder.getInsertBlock()->appendRecipe(VectorPtr);
8104     Ptr = VectorPtr;
8105   }
8106   if (LoadInst *Load = dyn_cast<LoadInst>(I))
8107     return new VPWidenMemoryInstructionRecipe(*Load, Ptr, Mask, Consecutive,
8108                                               Reverse);
8109 
8110   StoreInst *Store = cast<StoreInst>(I);
8111   return new VPWidenMemoryInstructionRecipe(*Store, Ptr, Operands[0], Mask,
8112                                             Consecutive, Reverse);
8113 }
8114 
8115 /// Creates a VPWidenIntOrFpInductionRecpipe for \p Phi. If needed, it will also
8116 /// insert a recipe to expand the step for the induction recipe.
8117 static VPWidenIntOrFpInductionRecipe *
8118 createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc,
8119                             VPValue *Start, const InductionDescriptor &IndDesc,
8120                             VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop,
8121                             VFRange &Range) {
8122   assert(IndDesc.getStartValue() ==
8123          Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
8124   assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) &&
8125          "step must be loop invariant");
8126 
8127   VPValue *Step =
8128       vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE);
8129   if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) {
8130     return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI);
8131   }
8132   assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
8133   return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc);
8134 }
8135 
8136 VPRecipeBase *VPRecipeBuilder::tryToOptimizeInductionPHI(
8137     PHINode *Phi, ArrayRef<VPValue *> Operands, VPlan &Plan, VFRange &Range) {
8138 
8139   // Check if this is an integer or fp induction. If so, build the recipe that
8140   // produces its scalar and vector values.
8141   if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
8142     return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, Plan,
8143                                        *PSE.getSE(), *OrigLoop, Range);
8144 
8145   // Check if this is pointer induction. If so, build the recipe for it.
8146   if (auto *II = Legal->getPointerInductionDescriptor(Phi)) {
8147     VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep(),
8148                                                            *PSE.getSE());
8149     return new VPWidenPointerInductionRecipe(
8150         Phi, Operands[0], Step, *II,
8151         LoopVectorizationPlanner::getDecisionAndClampRange(
8152             [&](ElementCount VF) {
8153               return CM.isScalarAfterVectorization(Phi, VF);
8154             },
8155             Range));
8156   }
8157   return nullptr;
8158 }
8159 
8160 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
8161     TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, VPlan &Plan) {
8162   // Optimize the special case where the source is a constant integer
8163   // induction variable. Notice that we can only optimize the 'trunc' case
8164   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
8165   // (c) other casts depend on pointer size.
8166 
8167   // Determine whether \p K is a truncation based on an induction variable that
8168   // can be optimized.
8169   auto isOptimizableIVTruncate =
8170       [&](Instruction *K) -> std::function<bool(ElementCount)> {
8171     return [=](ElementCount VF) -> bool {
8172       return CM.isOptimizableIVTruncate(K, VF);
8173     };
8174   };
8175 
8176   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8177           isOptimizableIVTruncate(I), Range)) {
8178 
8179     auto *Phi = cast<PHINode>(I->getOperand(0));
8180     const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi);
8181     VPValue *Start = Plan.getVPValueOrAddLiveIn(II.getStartValue());
8182     return createWidenInductionRecipes(Phi, I, Start, II, Plan, *PSE.getSE(),
8183                                        *OrigLoop, Range);
8184   }
8185   return nullptr;
8186 }
8187 
8188 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi,
8189                                                 ArrayRef<VPValue *> Operands,
8190                                                 VPlanPtr &Plan) {
8191   // If all incoming values are equal, the incoming VPValue can be used directly
8192   // instead of creating a new VPBlendRecipe.
8193   if (llvm::all_equal(Operands))
8194     return Operands[0];
8195 
8196   unsigned NumIncoming = Phi->getNumIncomingValues();
8197   // For in-loop reductions, we do not need to create an additional select.
8198   VPValue *InLoopVal = nullptr;
8199   for (unsigned In = 0; In < NumIncoming; In++) {
8200     PHINode *PhiOp =
8201         dyn_cast_or_null<PHINode>(Operands[In]->getUnderlyingValue());
8202     if (PhiOp && CM.isInLoopReduction(PhiOp)) {
8203       assert(!InLoopVal && "Found more than one in-loop reduction!");
8204       InLoopVal = Operands[In];
8205     }
8206   }
8207 
8208   assert((!InLoopVal || NumIncoming == 2) &&
8209          "Found an in-loop reduction for PHI with unexpected number of "
8210          "incoming values");
8211   if (InLoopVal)
8212     return Operands[Operands[0] == InLoopVal ? 1 : 0];
8213 
8214   // We know that all PHIs in non-header blocks are converted into selects, so
8215   // we don't have to worry about the insertion order and we can just use the
8216   // builder. At this point we generate the predication tree. There may be
8217   // duplications since this is a simple recursive scan, but future
8218   // optimizations will clean it up.
8219   SmallVector<VPValue *, 2> OperandsWithMask;
8220 
8221   for (unsigned In = 0; In < NumIncoming; In++) {
8222     VPValue *EdgeMask =
8223         createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), *Plan);
8224     assert((EdgeMask || NumIncoming == 1) &&
8225            "Multiple predecessors with one having a full mask");
8226     OperandsWithMask.push_back(Operands[In]);
8227     if (EdgeMask)
8228       OperandsWithMask.push_back(EdgeMask);
8229   }
8230   return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask));
8231 }
8232 
8233 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
8234                                                    ArrayRef<VPValue *> Operands,
8235                                                    VFRange &Range,
8236                                                    VPlanPtr &Plan) {
8237   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8238       [this, CI](ElementCount VF) {
8239         return CM.isScalarWithPredication(CI, VF);
8240       },
8241       Range);
8242 
8243   if (IsPredicated)
8244     return nullptr;
8245 
8246   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8247   if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8248              ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8249              ID == Intrinsic::pseudoprobe ||
8250              ID == Intrinsic::experimental_noalias_scope_decl))
8251     return nullptr;
8252 
8253   SmallVector<VPValue *, 4> Ops(Operands.take_front(CI->arg_size()));
8254 
8255   // Is it beneficial to perform intrinsic call compared to lib call?
8256   bool ShouldUseVectorIntrinsic =
8257       ID && LoopVectorizationPlanner::getDecisionAndClampRange(
8258                 [&](ElementCount VF) -> bool {
8259                   return CM.getCallWideningDecision(CI, VF).Kind ==
8260                          LoopVectorizationCostModel::CM_IntrinsicCall;
8261                 },
8262                 Range);
8263   if (ShouldUseVectorIntrinsic)
8264     return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()), ID,
8265                                  CI->getDebugLoc());
8266 
8267   Function *Variant = nullptr;
8268   std::optional<unsigned> MaskPos;
8269   // Is better to call a vectorized version of the function than to to scalarize
8270   // the call?
8271   auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange(
8272       [&](ElementCount VF) -> bool {
8273         // The following case may be scalarized depending on the VF.
8274         // The flag shows whether we can use a usual Call for vectorized
8275         // version of the instruction.
8276 
8277         // If we've found a variant at a previous VF, then stop looking. A
8278         // vectorized variant of a function expects input in a certain shape
8279         // -- basically the number of input registers, the number of lanes
8280         // per register, and whether there's a mask required.
8281         // We store a pointer to the variant in the VPWidenCallRecipe, so
8282         // once we have an appropriate variant it's only valid for that VF.
8283         // This will force a different vplan to be generated for each VF that
8284         // finds a valid variant.
8285         if (Variant)
8286           return false;
8287         LoopVectorizationCostModel::CallWideningDecision Decision =
8288             CM.getCallWideningDecision(CI, VF);
8289         if (Decision.Kind == LoopVectorizationCostModel::CM_VectorCall) {
8290           Variant = Decision.Variant;
8291           MaskPos = Decision.MaskPos;
8292           return true;
8293         }
8294 
8295         return false;
8296       },
8297       Range);
8298   if (ShouldUseVectorCall) {
8299     if (MaskPos.has_value()) {
8300       // We have 2 cases that would require a mask:
8301       //   1) The block needs to be predicated, either due to a conditional
8302       //      in the scalar loop or use of an active lane mask with
8303       //      tail-folding, and we use the appropriate mask for the block.
8304       //   2) No mask is required for the block, but the only available
8305       //      vector variant at this VF requires a mask, so we synthesize an
8306       //      all-true mask.
8307       VPValue *Mask = nullptr;
8308       if (Legal->isMaskRequired(CI))
8309         Mask = getBlockInMask(CI->getParent());
8310       else
8311         Mask = Plan->getVPValueOrAddLiveIn(ConstantInt::getTrue(
8312             IntegerType::getInt1Ty(Variant->getFunctionType()->getContext())));
8313 
8314       Ops.insert(Ops.begin() + *MaskPos, Mask);
8315     }
8316 
8317     return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()),
8318                                  Intrinsic::not_intrinsic, CI->getDebugLoc(),
8319                                  Variant);
8320   }
8321 
8322   return nullptr;
8323 }
8324 
8325 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
8326   assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8327          !isa<StoreInst>(I) && "Instruction should have been handled earlier");
8328   // Instruction should be widened, unless it is scalar after vectorization,
8329   // scalarization is profitable or it is predicated.
8330   auto WillScalarize = [this, I](ElementCount VF) -> bool {
8331     return CM.isScalarAfterVectorization(I, VF) ||
8332            CM.isProfitableToScalarize(I, VF) ||
8333            CM.isScalarWithPredication(I, VF);
8334   };
8335   return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
8336                                                              Range);
8337 }
8338 
8339 VPRecipeBase *VPRecipeBuilder::tryToWiden(Instruction *I,
8340                                           ArrayRef<VPValue *> Operands,
8341                                           VPBasicBlock *VPBB, VPlanPtr &Plan) {
8342   switch (I->getOpcode()) {
8343   default:
8344     return nullptr;
8345   case Instruction::SDiv:
8346   case Instruction::UDiv:
8347   case Instruction::SRem:
8348   case Instruction::URem: {
8349     // If not provably safe, use a select to form a safe divisor before widening the
8350     // div/rem operation itself.  Otherwise fall through to general handling below.
8351     if (CM.isPredicatedInst(I)) {
8352       SmallVector<VPValue *> Ops(Operands.begin(), Operands.end());
8353       VPValue *Mask = getBlockInMask(I->getParent());
8354       VPValue *One = Plan->getVPValueOrAddLiveIn(
8355           ConstantInt::get(I->getType(), 1u, false));
8356       auto *SafeRHS =
8357          new VPInstruction(Instruction::Select, {Mask, Ops[1], One},
8358                            I->getDebugLoc());
8359       VPBB->appendRecipe(SafeRHS);
8360       Ops[1] = SafeRHS;
8361       return new VPWidenRecipe(*I, make_range(Ops.begin(), Ops.end()));
8362     }
8363     [[fallthrough]];
8364   }
8365   case Instruction::Add:
8366   case Instruction::And:
8367   case Instruction::AShr:
8368   case Instruction::FAdd:
8369   case Instruction::FCmp:
8370   case Instruction::FDiv:
8371   case Instruction::FMul:
8372   case Instruction::FNeg:
8373   case Instruction::FRem:
8374   case Instruction::FSub:
8375   case Instruction::ICmp:
8376   case Instruction::LShr:
8377   case Instruction::Mul:
8378   case Instruction::Or:
8379   case Instruction::Select:
8380   case Instruction::Shl:
8381   case Instruction::Sub:
8382   case Instruction::Xor:
8383   case Instruction::Freeze:
8384     return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end()));
8385   };
8386 }
8387 
8388 void VPRecipeBuilder::fixHeaderPhis() {
8389   BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
8390   for (VPHeaderPHIRecipe *R : PhisToFix) {
8391     auto *PN = cast<PHINode>(R->getUnderlyingValue());
8392     VPRecipeBase *IncR =
8393         getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
8394     R->addOperand(IncR->getVPSingleValue());
8395   }
8396 }
8397 
8398 VPRecipeOrVPValueTy VPRecipeBuilder::handleReplication(Instruction *I,
8399                                                        VFRange &Range,
8400                                                        VPlan &Plan) {
8401   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
8402       [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8403       Range);
8404 
8405   bool IsPredicated = CM.isPredicatedInst(I);
8406 
8407   // Even if the instruction is not marked as uniform, there are certain
8408   // intrinsic calls that can be effectively treated as such, so we check for
8409   // them here. Conservatively, we only do this for scalable vectors, since
8410   // for fixed-width VFs we can always fall back on full scalarization.
8411   if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
8412     switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
8413     case Intrinsic::assume:
8414     case Intrinsic::lifetime_start:
8415     case Intrinsic::lifetime_end:
8416       // For scalable vectors if one of the operands is variant then we still
8417       // want to mark as uniform, which will generate one instruction for just
8418       // the first lane of the vector. We can't scalarize the call in the same
8419       // way as for fixed-width vectors because we don't know how many lanes
8420       // there are.
8421       //
8422       // The reasons for doing it this way for scalable vectors are:
8423       //   1. For the assume intrinsic generating the instruction for the first
8424       //      lane is still be better than not generating any at all. For
8425       //      example, the input may be a splat across all lanes.
8426       //   2. For the lifetime start/end intrinsics the pointer operand only
8427       //      does anything useful when the input comes from a stack object,
8428       //      which suggests it should always be uniform. For non-stack objects
8429       //      the effect is to poison the object, which still allows us to
8430       //      remove the call.
8431       IsUniform = true;
8432       break;
8433     default:
8434       break;
8435     }
8436   }
8437   VPValue *BlockInMask = nullptr;
8438   if (!IsPredicated) {
8439     // Finalize the recipe for Instr, first if it is not predicated.
8440     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8441   } else {
8442     LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8443     // Instructions marked for predication are replicated and a mask operand is
8444     // added initially. Masked replicate recipes will later be placed under an
8445     // if-then construct to prevent side-effects. Generate recipes to compute
8446     // the block mask for this region.
8447     BlockInMask = getBlockInMask(I->getParent());
8448   }
8449 
8450   auto *Recipe = new VPReplicateRecipe(I, Plan.mapToVPValues(I->operands()),
8451                                        IsUniform, BlockInMask);
8452   return toVPRecipeResult(Recipe);
8453 }
8454 
8455 VPRecipeOrVPValueTy
8456 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8457                                         ArrayRef<VPValue *> Operands,
8458                                         VFRange &Range, VPBasicBlock *VPBB,
8459                                         VPlanPtr &Plan) {
8460   // First, check for specific widening recipes that deal with inductions, Phi
8461   // nodes, calls and memory operations.
8462   VPRecipeBase *Recipe;
8463   if (auto Phi = dyn_cast<PHINode>(Instr)) {
8464     if (Phi->getParent() != OrigLoop->getHeader())
8465       return tryToBlend(Phi, Operands, Plan);
8466 
8467     // Always record recipes for header phis. Later first-order recurrence phis
8468     // can have earlier phis as incoming values.
8469     recordRecipeOf(Phi);
8470 
8471     if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, *Plan, Range)))
8472       return toVPRecipeResult(Recipe);
8473 
8474     VPHeaderPHIRecipe *PhiRecipe = nullptr;
8475     assert((Legal->isReductionVariable(Phi) ||
8476             Legal->isFixedOrderRecurrence(Phi)) &&
8477            "can only widen reductions and fixed-order recurrences here");
8478     VPValue *StartV = Operands[0];
8479     if (Legal->isReductionVariable(Phi)) {
8480       const RecurrenceDescriptor &RdxDesc =
8481           Legal->getReductionVars().find(Phi)->second;
8482       assert(RdxDesc.getRecurrenceStartValue() ==
8483              Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8484       PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
8485                                            CM.isInLoopReduction(Phi),
8486                                            CM.useOrderedReductions(RdxDesc));
8487     } else {
8488       // TODO: Currently fixed-order recurrences are modeled as chains of
8489       // first-order recurrences. If there are no users of the intermediate
8490       // recurrences in the chain, the fixed order recurrence should be modeled
8491       // directly, enabling more efficient codegen.
8492       PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
8493     }
8494 
8495     // Record the incoming value from the backedge, so we can add the incoming
8496     // value from the backedge after all recipes have been created.
8497     auto *Inc = cast<Instruction>(
8498         Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
8499     auto RecipeIter = Ingredient2Recipe.find(Inc);
8500     if (RecipeIter == Ingredient2Recipe.end())
8501       recordRecipeOf(Inc);
8502 
8503     PhisToFix.push_back(PhiRecipe);
8504     return toVPRecipeResult(PhiRecipe);
8505   }
8506 
8507   if (isa<TruncInst>(Instr) &&
8508       (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands,
8509                                                Range, *Plan)))
8510     return toVPRecipeResult(Recipe);
8511 
8512   // All widen recipes below deal only with VF > 1.
8513   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8514           [&](ElementCount VF) { return VF.isScalar(); }, Range))
8515     return nullptr;
8516 
8517   if (auto *CI = dyn_cast<CallInst>(Instr))
8518     return toVPRecipeResult(tryToWidenCall(CI, Operands, Range, Plan));
8519 
8520   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8521     return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));
8522 
8523   if (!shouldWiden(Instr, Range))
8524     return nullptr;
8525 
8526   if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
8527     return toVPRecipeResult(new VPWidenGEPRecipe(
8528         GEP, make_range(Operands.begin(), Operands.end())));
8529 
8530   if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8531     return toVPRecipeResult(new VPWidenSelectRecipe(
8532         *SI, make_range(Operands.begin(), Operands.end())));
8533   }
8534 
8535   if (auto *CI = dyn_cast<CastInst>(Instr)) {
8536     return toVPRecipeResult(new VPWidenCastRecipe(CI->getOpcode(), Operands[0],
8537                                                   CI->getType(), *CI));
8538   }
8539 
8540   return toVPRecipeResult(tryToWiden(Instr, Operands, VPBB, Plan));
8541 }
8542 
8543 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8544                                                         ElementCount MaxVF) {
8545   assert(OrigLoop->isInnermost() && "Inner loop expected.");
8546 
8547   auto MaxVFTimes2 = MaxVF * 2;
8548   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
8549     VFRange SubRange = {VF, MaxVFTimes2};
8550     if (auto Plan = tryToBuildVPlanWithVPRecipes(SubRange)) {
8551       // Now optimize the initial VPlan.
8552       if (!Plan->hasVF(ElementCount::getFixed(1)))
8553         VPlanTransforms::truncateToMinimalBitwidths(
8554             *Plan, CM.getMinimalBitwidths(), PSE.getSE()->getContext());
8555       VPlanTransforms::optimize(*Plan, *PSE.getSE());
8556       assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid");
8557       VPlans.push_back(std::move(Plan));
8558     }
8559     VF = SubRange.End;
8560   }
8561 }
8562 
8563 // Add the necessary canonical IV and branch recipes required to control the
8564 // loop.
8565 static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
8566                                   DebugLoc DL) {
8567   Value *StartIdx = ConstantInt::get(IdxTy, 0);
8568   auto *StartV = Plan.getVPValueOrAddLiveIn(StartIdx);
8569 
8570   // Add a VPCanonicalIVPHIRecipe starting at 0 to the header.
8571   auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
8572   VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
8573   VPBasicBlock *Header = TopRegion->getEntryBasicBlock();
8574   Header->insert(CanonicalIVPHI, Header->begin());
8575 
8576   // Add a CanonicalIVIncrement{NUW} VPInstruction to increment the scalar
8577   // IV by VF * UF.
8578   auto *CanonicalIVIncrement =
8579       new VPInstruction(Instruction::Add, {CanonicalIVPHI, &Plan.getVFxUF()},
8580                         {HasNUW, false}, DL, "index.next");
8581   CanonicalIVPHI->addOperand(CanonicalIVIncrement);
8582 
8583   VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
8584   EB->appendRecipe(CanonicalIVIncrement);
8585 
8586   // Add the BranchOnCount VPInstruction to the latch.
8587   VPInstruction *BranchBack =
8588       new VPInstruction(VPInstruction::BranchOnCount,
8589                         {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
8590   EB->appendRecipe(BranchBack);
8591 }
8592 
8593 // Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the
8594 // original exit block.
8595 static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB, Loop *OrigLoop,
8596                                 VPlan &Plan) {
8597   BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock();
8598   BasicBlock *ExitingBB = OrigLoop->getExitingBlock();
8599   // Only handle single-exit loops with unique exit blocks for now.
8600   if (!ExitBB || !ExitBB->getSinglePredecessor() || !ExitingBB)
8601     return;
8602 
8603   // Introduce VPUsers modeling the exit values.
8604   for (PHINode &ExitPhi : ExitBB->phis()) {
8605     Value *IncomingValue =
8606         ExitPhi.getIncomingValueForBlock(ExitingBB);
8607     VPValue *V = Plan.getVPValueOrAddLiveIn(IncomingValue);
8608     Plan.addLiveOut(&ExitPhi, V);
8609   }
8610 }
8611 
8612 VPlanPtr
8613 LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
8614 
8615   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
8616 
8617   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
8618 
8619   // ---------------------------------------------------------------------------
8620   // Pre-construction: record ingredients whose recipes we'll need to further
8621   // process after constructing the initial VPlan.
8622   // ---------------------------------------------------------------------------
8623 
8624   // For each interleave group which is relevant for this (possibly trimmed)
8625   // Range, add it to the set of groups to be later applied to the VPlan and add
8626   // placeholders for its members' Recipes which we'll be replacing with a
8627   // single VPInterleaveRecipe.
8628   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
8629     auto applyIG = [IG, this](ElementCount VF) -> bool {
8630       bool Result = (VF.isVector() && // Query is illegal for VF == 1
8631                      CM.getWideningDecision(IG->getInsertPos(), VF) ==
8632                          LoopVectorizationCostModel::CM_Interleave);
8633       // For scalable vectors, the only interleave factor currently supported
8634       // is 2 since we require the (de)interleave2 intrinsics instead of
8635       // shufflevectors.
8636       assert((!Result || !VF.isScalable() || IG->getFactor() == 2) &&
8637              "Unsupported interleave factor for scalable vectors");
8638       return Result;
8639     };
8640     if (!getDecisionAndClampRange(applyIG, Range))
8641       continue;
8642     InterleaveGroups.insert(IG);
8643     for (unsigned i = 0; i < IG->getFactor(); i++)
8644       if (Instruction *Member = IG->getMember(i))
8645         RecipeBuilder.recordRecipeOf(Member);
8646   };
8647 
8648   // ---------------------------------------------------------------------------
8649   // Build initial VPlan: Scan the body of the loop in a topological order to
8650   // visit each basic block after having visited its predecessor basic blocks.
8651   // ---------------------------------------------------------------------------
8652 
8653   // Create initial VPlan skeleton, having a basic block for the pre-header
8654   // which contains SCEV expansions that need to happen before the CFG is
8655   // modified; a basic block for the vector pre-header, followed by a region for
8656   // the vector loop, followed by the middle basic block. The skeleton vector
8657   // loop region contains a header and latch basic blocks.
8658   VPlanPtr Plan = VPlan::createInitialVPlan(
8659       createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop),
8660       *PSE.getSE());
8661   VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body");
8662   VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch");
8663   VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB);
8664   Plan->getVectorLoopRegion()->setEntry(HeaderVPBB);
8665   Plan->getVectorLoopRegion()->setExiting(LatchVPBB);
8666 
8667   // Don't use getDecisionAndClampRange here, because we don't know the UF
8668   // so this function is better to be conservative, rather than to split
8669   // it up into different VPlans.
8670   // TODO: Consider using getDecisionAndClampRange here to split up VPlans.
8671   bool IVUpdateMayOverflow = false;
8672   for (ElementCount VF : Range)
8673     IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF);
8674 
8675   DebugLoc DL = getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
8676   TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow);
8677   // When not folding the tail, we know that the induction increment will not
8678   // overflow.
8679   bool HasNUW = Style == TailFoldingStyle::None;
8680   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL);
8681 
8682   // Scan the body of the loop in a topological order to visit each basic block
8683   // after having visited its predecessor basic blocks.
8684   LoopBlocksDFS DFS(OrigLoop);
8685   DFS.perform(LI);
8686 
8687   VPBasicBlock *VPBB = HeaderVPBB;
8688   bool NeedsMasks = CM.foldTailByMasking() ||
8689                     any_of(OrigLoop->blocks(), [this](BasicBlock *BB) {
8690                       return Legal->blockNeedsPredication(BB);
8691                     });
8692   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
8693     // Relevant instructions from basic block BB will be grouped into VPRecipe
8694     // ingredients and fill a new VPBasicBlock.
8695     if (VPBB != HeaderVPBB)
8696       VPBB->setName(BB->getName());
8697     Builder.setInsertPoint(VPBB);
8698 
8699     if (VPBB == HeaderVPBB)
8700       RecipeBuilder.createHeaderMask(*Plan);
8701     else if (NeedsMasks)
8702       RecipeBuilder.createBlockInMask(BB, *Plan);
8703 
8704     // Introduce each ingredient into VPlan.
8705     // TODO: Model and preserve debug intrinsics in VPlan.
8706     for (Instruction &I : drop_end(BB->instructionsWithoutDebug(false))) {
8707       Instruction *Instr = &I;
8708       SmallVector<VPValue *, 4> Operands;
8709       auto *Phi = dyn_cast<PHINode>(Instr);
8710       if (Phi && Phi->getParent() == OrigLoop->getHeader()) {
8711         Operands.push_back(Plan->getVPValueOrAddLiveIn(
8712             Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
8713       } else {
8714         auto OpRange = Plan->mapToVPValues(Instr->operands());
8715         Operands = {OpRange.begin(), OpRange.end()};
8716       }
8717 
8718       // Invariant stores inside loop will be deleted and a single store
8719       // with the final reduction value will be added to the exit block
8720       StoreInst *SI;
8721       if ((SI = dyn_cast<StoreInst>(&I)) &&
8722           Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
8723         continue;
8724 
8725       auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe(
8726           Instr, Operands, Range, VPBB, Plan);
8727       if (!RecipeOrValue)
8728         RecipeOrValue = RecipeBuilder.handleReplication(Instr, Range, *Plan);
8729       // If Instr can be simplified to an existing VPValue, use it.
8730       if (isa<VPValue *>(RecipeOrValue)) {
8731         auto *VPV = cast<VPValue *>(RecipeOrValue);
8732         Plan->addVPValue(Instr, VPV);
8733         // If the re-used value is a recipe, register the recipe for the
8734         // instruction, in case the recipe for Instr needs to be recorded.
8735         if (VPRecipeBase *R = VPV->getDefiningRecipe())
8736           RecipeBuilder.setRecipe(Instr, R);
8737         continue;
8738       }
8739       // Otherwise, add the new recipe.
8740       VPRecipeBase *Recipe = cast<VPRecipeBase *>(RecipeOrValue);
8741       for (auto *Def : Recipe->definedValues()) {
8742         auto *UV = Def->getUnderlyingValue();
8743         Plan->addVPValue(UV, Def);
8744       }
8745 
8746       RecipeBuilder.setRecipe(Instr, Recipe);
8747       if (isa<VPHeaderPHIRecipe>(Recipe)) {
8748         // VPHeaderPHIRecipes must be kept in the phi section of HeaderVPBB. In
8749         // the following cases, VPHeaderPHIRecipes may be created after non-phi
8750         // recipes and need to be moved to the phi section of HeaderVPBB:
8751         // * tail-folding (non-phi recipes computing the header mask are
8752         // introduced earlier than regular header phi recipes, and should appear
8753         // after them)
8754         // * Optimizing truncates to VPWidenIntOrFpInductionRecipe.
8755 
8756         assert((HeaderVPBB->getFirstNonPhi() == VPBB->end() ||
8757                 CM.foldTailByMasking() || isa<TruncInst>(Instr)) &&
8758                "unexpected recipe needs moving");
8759         Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
8760       } else
8761         VPBB->appendRecipe(Recipe);
8762     }
8763 
8764     VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB);
8765     VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
8766   }
8767 
8768   // After here, VPBB should not be used.
8769   VPBB = nullptr;
8770 
8771   if (CM.requiresScalarEpilogue(Range)) {
8772     // No edge from the middle block to the unique exit block has been inserted
8773     // and there is nothing to fix from vector loop; phis should have incoming
8774     // from scalar loop only.
8775   } else
8776     addUsersInExitBlock(HeaderVPBB, OrigLoop, *Plan);
8777 
8778   assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
8779          !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
8780          "entry block must be set to a VPRegionBlock having a non-empty entry "
8781          "VPBasicBlock");
8782   RecipeBuilder.fixHeaderPhis();
8783 
8784   // ---------------------------------------------------------------------------
8785   // Transform initial VPlan: Apply previously taken decisions, in order, to
8786   // bring the VPlan to its final state.
8787   // ---------------------------------------------------------------------------
8788 
8789   // Adjust the recipes for any inloop reductions.
8790   adjustRecipesForReductions(LatchVPBB, Plan, RecipeBuilder, Range.Start);
8791 
8792   // Interleave memory: for each Interleave Group we marked earlier as relevant
8793   // for this VPlan, replace the Recipes widening its memory instructions with a
8794   // single VPInterleaveRecipe at its insertion point.
8795   for (const auto *IG : InterleaveGroups) {
8796     auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
8797         RecipeBuilder.getRecipe(IG->getInsertPos()));
8798     SmallVector<VPValue *, 4> StoredValues;
8799     for (unsigned i = 0; i < IG->getFactor(); ++i)
8800       if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) {
8801         auto *StoreR =
8802             cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI));
8803         StoredValues.push_back(StoreR->getStoredValue());
8804       }
8805 
8806     bool NeedsMaskForGaps =
8807         IG->requiresScalarEpilogue() && !CM.isScalarEpilogueAllowed();
8808     auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
8809                                         Recipe->getMask(), NeedsMaskForGaps);
8810     VPIG->insertBefore(Recipe);
8811     unsigned J = 0;
8812     for (unsigned i = 0; i < IG->getFactor(); ++i)
8813       if (Instruction *Member = IG->getMember(i)) {
8814         VPRecipeBase *MemberR = RecipeBuilder.getRecipe(Member);
8815         if (!Member->getType()->isVoidTy()) {
8816           VPValue *OriginalV = MemberR->getVPSingleValue();
8817           OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
8818           J++;
8819         }
8820         MemberR->eraseFromParent();
8821       }
8822   }
8823 
8824   for (ElementCount VF : Range)
8825     Plan->addVF(VF);
8826   Plan->setName("Initial VPlan");
8827 
8828   // Replace VPValues for known constant strides guaranteed by predicate scalar
8829   // evolution.
8830   for (auto [_, Stride] : Legal->getLAI()->getSymbolicStrides()) {
8831     auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
8832     auto *ScevStride = dyn_cast<SCEVConstant>(PSE.getSCEV(StrideV));
8833     // Only handle constant strides for now.
8834     if (!ScevStride)
8835       continue;
8836     Constant *CI = ConstantInt::get(Stride->getType(), ScevStride->getAPInt());
8837 
8838     auto *ConstVPV = Plan->getVPValueOrAddLiveIn(CI);
8839     // The versioned value may not be used in the loop directly, so just add a
8840     // new live-in in those cases.
8841     Plan->getVPValueOrAddLiveIn(StrideV)->replaceAllUsesWith(ConstVPV);
8842   }
8843 
8844   // From this point onwards, VPlan-to-VPlan transformations may change the plan
8845   // in ways that accessing values using original IR values is incorrect.
8846   Plan->disableValue2VPValue();
8847 
8848   // Sink users of fixed-order recurrence past the recipe defining the previous
8849   // value and introduce FirstOrderRecurrenceSplice VPInstructions.
8850   if (!VPlanTransforms::adjustFixedOrderRecurrences(*Plan, Builder))
8851     return nullptr;
8852 
8853   if (useActiveLaneMask(Style)) {
8854     // TODO: Move checks to VPlanTransforms::addActiveLaneMask once
8855     // TailFoldingStyle is visible there.
8856     bool ForControlFlow = useActiveLaneMaskForControlFlow(Style);
8857     bool WithoutRuntimeCheck =
8858         Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
8859     VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow,
8860                                        WithoutRuntimeCheck);
8861   }
8862   return Plan;
8863 }
8864 
8865 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
8866   // Outer loop handling: They may require CFG and instruction level
8867   // transformations before even evaluating whether vectorization is profitable.
8868   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
8869   // the vectorization pipeline.
8870   assert(!OrigLoop->isInnermost());
8871   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
8872 
8873   // Create new empty VPlan
8874   auto Plan = VPlan::createInitialVPlan(
8875       createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop),
8876       *PSE.getSE());
8877 
8878   // Build hierarchical CFG
8879   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
8880   HCFGBuilder.buildHierarchicalCFG();
8881 
8882   for (ElementCount VF : Range)
8883     Plan->addVF(VF);
8884 
8885   VPlanTransforms::VPInstructionsToVPRecipes(
8886       Plan,
8887       [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
8888       *PSE.getSE(), *TLI);
8889 
8890   // Remove the existing terminator of the exiting block of the top-most region.
8891   // A BranchOnCount will be added instead when adding the canonical IV recipes.
8892   auto *Term =
8893       Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator();
8894   Term->eraseFromParent();
8895 
8896   // Tail folding is not supported for outer loops, so the induction increment
8897   // is guaranteed to not wrap.
8898   bool HasNUW = true;
8899   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW,
8900                         DebugLoc());
8901   return Plan;
8902 }
8903 
8904 // Adjust the recipes for reductions. For in-loop reductions the chain of
8905 // instructions leading from the loop exit instr to the phi need to be converted
8906 // to reductions, with one operand being vector and the other being the scalar
8907 // reduction chain. For other reductions, a select is introduced between the phi
8908 // and live-out recipes when folding the tail.
8909 //
8910 // A ComputeReductionResult recipe is added to the middle block, also for
8911 // in-loop reductions which compute their result in-loop, because generating
8912 // the subsequent bc.merge.rdx phi is driven by ComputeReductionResult recipes.
8913 void LoopVectorizationPlanner::adjustRecipesForReductions(
8914     VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
8915     ElementCount MinVF) {
8916   VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion();
8917   VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock();
8918   // Gather all VPReductionPHIRecipe and sort them so that Intermediate stores
8919   // sank outside of the loop would keep the same order as they had in the
8920   // original loop.
8921   SmallVector<VPReductionPHIRecipe *> ReductionPHIList;
8922   for (VPRecipeBase &R : Header->phis()) {
8923     if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
8924       ReductionPHIList.emplace_back(ReductionPhi);
8925   }
8926   bool HasIntermediateStore = false;
8927   stable_sort(ReductionPHIList,
8928               [this, &HasIntermediateStore](const VPReductionPHIRecipe *R1,
8929                                             const VPReductionPHIRecipe *R2) {
8930                 auto *IS1 = R1->getRecurrenceDescriptor().IntermediateStore;
8931                 auto *IS2 = R2->getRecurrenceDescriptor().IntermediateStore;
8932                 HasIntermediateStore |= IS1 || IS2;
8933 
8934                 // If neither of the recipes has an intermediate store, keep the
8935                 // order the same.
8936                 if (!IS1 && !IS2)
8937                   return false;
8938 
8939                 // If only one of the recipes has an intermediate store, then
8940                 // move it towards the beginning of the list.
8941                 if (IS1 && !IS2)
8942                   return true;
8943 
8944                 if (!IS1 && IS2)
8945                   return false;
8946 
8947                 // If both recipes have an intermediate store, then the recipe
8948                 // with the later store should be processed earlier. So it
8949                 // should go to the beginning of the list.
8950                 return DT->dominates(IS2, IS1);
8951               });
8952 
8953   if (HasIntermediateStore && ReductionPHIList.size() > 1)
8954     for (VPRecipeBase *R : ReductionPHIList)
8955       R->moveBefore(*Header, Header->getFirstNonPhi());
8956 
8957   for (VPRecipeBase &R : Header->phis()) {
8958     auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
8959     if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered()))
8960       continue;
8961 
8962     const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
8963     RecurKind Kind = RdxDesc.getRecurrenceKind();
8964     assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) &&
8965            "AnyOf reductions are not allowed for in-loop reductions");
8966 
8967     // Collect the chain of "link" recipes for the reduction starting at PhiR.
8968     SetVector<VPSingleDefRecipe *> Worklist;
8969     Worklist.insert(PhiR);
8970     for (unsigned I = 0; I != Worklist.size(); ++I) {
8971       VPSingleDefRecipe *Cur = Worklist[I];
8972       for (VPUser *U : Cur->users()) {
8973         auto *UserRecipe = dyn_cast<VPSingleDefRecipe>(U);
8974         if (!UserRecipe) {
8975           assert(isa<VPLiveOut>(U) &&
8976                  "U must either be a VPSingleDef or VPLiveOut");
8977           continue;
8978         }
8979         Worklist.insert(UserRecipe);
8980       }
8981     }
8982 
8983     // Visit operation "Links" along the reduction chain top-down starting from
8984     // the phi until LoopExitValue. We keep track of the previous item
8985     // (PreviousLink) to tell which of the two operands of a Link will remain
8986     // scalar and which will be reduced. For minmax by select(cmp), Link will be
8987     // the select instructions.
8988     VPSingleDefRecipe *PreviousLink = PhiR; // Aka Worklist[0].
8989     for (VPSingleDefRecipe *CurrentLink : Worklist.getArrayRef().drop_front()) {
8990       Instruction *CurrentLinkI = CurrentLink->getUnderlyingInstr();
8991 
8992       // Index of the first operand which holds a non-mask vector operand.
8993       unsigned IndexOfFirstOperand;
8994       // Recognize a call to the llvm.fmuladd intrinsic.
8995       bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
8996       VPValue *VecOp;
8997       VPBasicBlock *LinkVPBB = CurrentLink->getParent();
8998       if (IsFMulAdd) {
8999         assert(
9000             RecurrenceDescriptor::isFMulAddIntrinsic(CurrentLinkI) &&
9001             "Expected instruction to be a call to the llvm.fmuladd intrinsic");
9002         assert(((MinVF.isScalar() && isa<VPReplicateRecipe>(CurrentLink)) ||
9003                 isa<VPWidenCallRecipe>(CurrentLink)) &&
9004                CurrentLink->getOperand(2) == PreviousLink &&
9005                "expected a call where the previous link is the added operand");
9006 
9007         // If the instruction is a call to the llvm.fmuladd intrinsic then we
9008         // need to create an fmul recipe (multiplying the first two operands of
9009         // the fmuladd together) to use as the vector operand for the fadd
9010         // reduction.
9011         VPInstruction *FMulRecipe = new VPInstruction(
9012             Instruction::FMul,
9013             {CurrentLink->getOperand(0), CurrentLink->getOperand(1)},
9014             CurrentLinkI->getFastMathFlags());
9015         LinkVPBB->insert(FMulRecipe, CurrentLink->getIterator());
9016         VecOp = FMulRecipe;
9017       } else {
9018         if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9019           if (isa<VPWidenRecipe>(CurrentLink)) {
9020             assert(isa<CmpInst>(CurrentLinkI) &&
9021                    "need to have the compare of the select");
9022             continue;
9023           }
9024           assert(isa<VPWidenSelectRecipe>(CurrentLink) &&
9025                  "must be a select recipe");
9026           IndexOfFirstOperand = 1;
9027         } else {
9028           assert((MinVF.isScalar() || isa<VPWidenRecipe>(CurrentLink)) &&
9029                  "Expected to replace a VPWidenSC");
9030           IndexOfFirstOperand = 0;
9031         }
9032         // Note that for non-commutable operands (cmp-selects), the semantics of
9033         // the cmp-select are captured in the recurrence kind.
9034         unsigned VecOpId =
9035             CurrentLink->getOperand(IndexOfFirstOperand) == PreviousLink
9036                 ? IndexOfFirstOperand + 1
9037                 : IndexOfFirstOperand;
9038         VecOp = CurrentLink->getOperand(VecOpId);
9039         assert(VecOp != PreviousLink &&
9040                CurrentLink->getOperand(CurrentLink->getNumOperands() - 1 -
9041                                        (VecOpId - IndexOfFirstOperand)) ==
9042                    PreviousLink &&
9043                "PreviousLink must be the operand other than VecOp");
9044       }
9045 
9046       BasicBlock *BB = CurrentLinkI->getParent();
9047       VPValue *CondOp = nullptr;
9048       if (CM.blockNeedsPredicationForAnyReason(BB)) {
9049         VPBuilder::InsertPointGuard Guard(Builder);
9050         Builder.setInsertPoint(CurrentLink);
9051         CondOp = RecipeBuilder.getBlockInMask(BB);
9052       }
9053 
9054       VPReductionRecipe *RedRecipe = new VPReductionRecipe(
9055           RdxDesc, CurrentLinkI, PreviousLink, VecOp, CondOp);
9056       // Append the recipe to the end of the VPBasicBlock because we need to
9057       // ensure that it comes after all of it's inputs, including CondOp.
9058       // Note that this transformation may leave over dead recipes (including
9059       // CurrentLink), which will be cleaned by a later VPlan transform.
9060       LinkVPBB->appendRecipe(RedRecipe);
9061       CurrentLink->replaceAllUsesWith(RedRecipe);
9062       PreviousLink = RedRecipe;
9063     }
9064   }
9065   Builder.setInsertPoint(&*LatchVPBB->begin());
9066   for (VPRecipeBase &R :
9067        Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
9068     VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9069     if (!PhiR)
9070       continue;
9071 
9072     const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
9073     // If tail is folded by masking, introduce selects between the phi
9074     // and the live-out instruction of each reduction, at the beginning of the
9075     // dedicated latch block.
9076     auto *OrigExitingVPV = PhiR->getBackedgeValue();
9077     auto *NewExitingVPV = PhiR->getBackedgeValue();
9078     if (!PhiR->isInLoop() && CM.foldTailByMasking()) {
9079       VPValue *Cond = RecipeBuilder.getBlockInMask(OrigLoop->getHeader());
9080       assert(OrigExitingVPV->getDefiningRecipe()->getParent() != LatchVPBB &&
9081              "reduction recipe must be defined before latch");
9082       Type *PhiTy = PhiR->getOperand(0)->getLiveInIRValue()->getType();
9083       std::optional<FastMathFlags> FMFs =
9084           PhiTy->isFloatingPointTy()
9085               ? std::make_optional(RdxDesc.getFastMathFlags())
9086               : std::nullopt;
9087       NewExitingVPV =
9088           Builder.createSelect(Cond, OrigExitingVPV, PhiR, {}, "", FMFs);
9089       OrigExitingVPV->replaceUsesWithIf(NewExitingVPV, [](VPUser &U, unsigned) {
9090         return isa<VPInstruction>(&U) &&
9091                cast<VPInstruction>(&U)->getOpcode() ==
9092                    VPInstruction::ComputeReductionResult;
9093       });
9094       if (PreferPredicatedReductionSelect ||
9095           TTI.preferPredicatedReductionSelect(
9096               PhiR->getRecurrenceDescriptor().getOpcode(), PhiTy,
9097               TargetTransformInfo::ReductionFlags()))
9098         PhiR->setOperand(1, NewExitingVPV);
9099     }
9100 
9101     // If the vector reduction can be performed in a smaller type, we truncate
9102     // then extend the loop exit value to enable InstCombine to evaluate the
9103     // entire expression in the smaller type.
9104     Type *PhiTy = PhiR->getStartValue()->getLiveInIRValue()->getType();
9105     if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
9106       assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
9107       Type *RdxTy = RdxDesc.getRecurrenceType();
9108       auto *Trunc =
9109           new VPWidenCastRecipe(Instruction::Trunc, NewExitingVPV, RdxTy);
9110       auto *Extnd =
9111           RdxDesc.isSigned()
9112               ? new VPWidenCastRecipe(Instruction::SExt, Trunc, PhiTy)
9113               : new VPWidenCastRecipe(Instruction::ZExt, Trunc, PhiTy);
9114 
9115       Trunc->insertAfter(NewExitingVPV->getDefiningRecipe());
9116       Extnd->insertAfter(Trunc);
9117       if (PhiR->getOperand(1) == NewExitingVPV)
9118         PhiR->setOperand(1, Extnd->getVPSingleValue());
9119       NewExitingVPV = Extnd;
9120     }
9121 
9122     // We want code in the middle block to appear to execute on the location of
9123     // the scalar loop's latch terminator because: (a) it is all compiler
9124     // generated, (b) these instructions are always executed after evaluating
9125     // the latch conditional branch, and (c) other passes may add new
9126     // predecessors which terminate on this line. This is the easiest way to
9127     // ensure we don't accidentally cause an extra step back into the loop while
9128     // debugging.
9129     DebugLoc ExitDL = OrigLoop->getLoopLatch()->getTerminator()->getDebugLoc();
9130 
9131     // TODO: At the moment ComputeReductionResult also drives creation of the
9132     // bc.merge.rdx phi nodes, hence it needs to be created unconditionally here
9133     // even for in-loop reductions, until the reduction resume value handling is
9134     // also modeled in VPlan.
9135     auto *FinalReductionResult = new VPInstruction(
9136         VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL);
9137     cast<VPBasicBlock>(VectorLoopRegion->getSingleSuccessor())
9138         ->appendRecipe(FinalReductionResult);
9139     OrigExitingVPV->replaceUsesWithIf(
9140         FinalReductionResult,
9141         [](VPUser &User, unsigned) { return isa<VPLiveOut>(&User); });
9142   }
9143 
9144   VPlanTransforms::clearReductionWrapFlags(*Plan);
9145 }
9146 
9147 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
9148 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
9149                                VPSlotTracker &SlotTracker) const {
9150   O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
9151   IG->getInsertPos()->printAsOperand(O, false);
9152   O << ", ";
9153   getAddr()->printAsOperand(O, SlotTracker);
9154   VPValue *Mask = getMask();
9155   if (Mask) {
9156     O << ", ";
9157     Mask->printAsOperand(O, SlotTracker);
9158   }
9159 
9160   unsigned OpIdx = 0;
9161   for (unsigned i = 0; i < IG->getFactor(); ++i) {
9162     if (!IG->getMember(i))
9163       continue;
9164     if (getNumStoreOperands() > 0) {
9165       O << "\n" << Indent << "  store ";
9166       getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
9167       O << " to index " << i;
9168     } else {
9169       O << "\n" << Indent << "  ";
9170       getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
9171       O << " = load from index " << i;
9172     }
9173     ++OpIdx;
9174   }
9175 }
9176 #endif
9177 
9178 void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
9179   assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction &&
9180          "Not a pointer induction according to InductionDescriptor!");
9181   assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() &&
9182          "Unexpected type.");
9183 
9184   auto *IVR = getParent()->getPlan()->getCanonicalIV();
9185   PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0));
9186 
9187   if (onlyScalarsGenerated(State.VF)) {
9188     // This is the normalized GEP that starts counting at zero.
9189     Value *PtrInd = State.Builder.CreateSExtOrTrunc(
9190         CanonicalIV, IndDesc.getStep()->getType());
9191     // Determine the number of scalars we need to generate for each unroll
9192     // iteration. If the instruction is uniform, we only need to generate the
9193     // first lane. Otherwise, we generate all VF values.
9194     bool IsUniform = vputils::onlyFirstLaneUsed(this);
9195     assert((IsUniform || !State.VF.isScalable()) &&
9196            "Cannot scalarize a scalable VF");
9197     unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue();
9198 
9199     for (unsigned Part = 0; Part < State.UF; ++Part) {
9200       Value *PartStart =
9201           createStepForVF(State.Builder, PtrInd->getType(), State.VF, Part);
9202 
9203       for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
9204         Value *Idx = State.Builder.CreateAdd(
9205             PartStart, ConstantInt::get(PtrInd->getType(), Lane));
9206         Value *GlobalIdx = State.Builder.CreateAdd(PtrInd, Idx);
9207 
9208         Value *Step = State.get(getOperand(1), VPIteration(Part, Lane));
9209         Value *SclrGep = emitTransformedIndex(
9210             State.Builder, GlobalIdx, IndDesc.getStartValue(), Step,
9211             IndDesc.getKind(), IndDesc.getInductionBinOp());
9212         SclrGep->setName("next.gep");
9213         State.set(this, SclrGep, VPIteration(Part, Lane));
9214       }
9215     }
9216     return;
9217   }
9218 
9219   Type *PhiType = IndDesc.getStep()->getType();
9220 
9221   // Build a pointer phi
9222   Value *ScalarStartValue = getStartValue()->getLiveInIRValue();
9223   Type *ScStValueType = ScalarStartValue->getType();
9224   PHINode *NewPointerPhi =
9225       PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV);
9226 
9227   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
9228   NewPointerPhi->addIncoming(ScalarStartValue, VectorPH);
9229 
9230   // A pointer induction, performed by using a gep
9231   Instruction *InductionLoc = &*State.Builder.GetInsertPoint();
9232 
9233   Value *ScalarStepValue = State.get(getOperand(1), VPIteration(0, 0));
9234   Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF);
9235   Value *NumUnrolledElems =
9236       State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF));
9237   Value *InductionGEP = GetElementPtrInst::Create(
9238       State.Builder.getInt8Ty(), NewPointerPhi,
9239       State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
9240       InductionLoc);
9241   // Add induction update using an incorrect block temporarily. The phi node
9242   // will be fixed after VPlan execution. Note that at this point the latch
9243   // block cannot be used, as it does not exist yet.
9244   // TODO: Model increment value in VPlan, by turning the recipe into a
9245   // multi-def and a subclass of VPHeaderPHIRecipe.
9246   NewPointerPhi->addIncoming(InductionGEP, VectorPH);
9247 
9248   // Create UF many actual address geps that use the pointer
9249   // phi as base and a vectorized version of the step value
9250   // (<step*0, ..., step*N>) as offset.
9251   for (unsigned Part = 0; Part < State.UF; ++Part) {
9252     Type *VecPhiType = VectorType::get(PhiType, State.VF);
9253     Value *StartOffsetScalar =
9254         State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part));
9255     Value *StartOffset =
9256         State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
9257     // Create a vector of consecutive numbers from zero to VF.
9258     StartOffset = State.Builder.CreateAdd(
9259         StartOffset, State.Builder.CreateStepVector(VecPhiType));
9260 
9261     assert(ScalarStepValue == State.get(getOperand(1), VPIteration(Part, 0)) &&
9262            "scalar step must be the same across all parts");
9263     Value *GEP = State.Builder.CreateGEP(
9264         State.Builder.getInt8Ty(), NewPointerPhi,
9265         State.Builder.CreateMul(
9266             StartOffset,
9267             State.Builder.CreateVectorSplat(State.VF, ScalarStepValue),
9268             "vector.gep"));
9269     State.set(this, GEP, Part);
9270   }
9271 }
9272 
9273 void VPDerivedIVRecipe::execute(VPTransformState &State) {
9274   assert(!State.Instance && "VPDerivedIVRecipe being replicated.");
9275 
9276   // Fast-math-flags propagate from the original induction instruction.
9277   IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
9278   if (FPBinOp)
9279     State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags());
9280 
9281   Value *Step = State.get(getStepValue(), VPIteration(0, 0));
9282   Value *CanonicalIV = State.get(getCanonicalIV(), VPIteration(0, 0));
9283   Value *DerivedIV = emitTransformedIndex(
9284       State.Builder, CanonicalIV, getStartValue()->getLiveInIRValue(), Step,
9285       Kind, cast_if_present<BinaryOperator>(FPBinOp));
9286   DerivedIV->setName("offset.idx");
9287   if (TruncResultTy) {
9288     assert(TruncResultTy != DerivedIV->getType() &&
9289            Step->getType()->isIntegerTy() &&
9290            "Truncation requires an integer step");
9291     DerivedIV = State.Builder.CreateTrunc(DerivedIV, TruncResultTy);
9292   }
9293   assert(DerivedIV != CanonicalIV && "IV didn't need transforming?");
9294 
9295   State.set(this, DerivedIV, VPIteration(0, 0));
9296 }
9297 
9298 void VPInterleaveRecipe::execute(VPTransformState &State) {
9299   assert(!State.Instance && "Interleave group being replicated.");
9300   State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
9301                                       getStoredValues(), getMask(),
9302                                       NeedsMaskForGaps);
9303 }
9304 
9305 void VPReductionRecipe::execute(VPTransformState &State) {
9306   assert(!State.Instance && "Reduction being replicated.");
9307   Value *PrevInChain = State.get(getChainOp(), 0);
9308   RecurKind Kind = RdxDesc.getRecurrenceKind();
9309   bool IsOrdered = State.ILV->useOrderedReductions(RdxDesc);
9310   // Propagate the fast-math flags carried by the underlying instruction.
9311   IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
9312   State.Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
9313   for (unsigned Part = 0; Part < State.UF; ++Part) {
9314     Value *NewVecOp = State.get(getVecOp(), Part);
9315     if (VPValue *Cond = getCondOp()) {
9316       Value *NewCond = State.VF.isVector() ? State.get(Cond, Part)
9317                                            : State.get(Cond, {Part, 0});
9318       VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType());
9319       Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType();
9320       Value *Iden = RdxDesc.getRecurrenceIdentity(Kind, ElementTy,
9321                                                   RdxDesc.getFastMathFlags());
9322       if (State.VF.isVector()) {
9323         Iden =
9324             State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden);
9325       }
9326 
9327       Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Iden);
9328       NewVecOp = Select;
9329     }
9330     Value *NewRed;
9331     Value *NextInChain;
9332     if (IsOrdered) {
9333       if (State.VF.isVector())
9334         NewRed = createOrderedReduction(State.Builder, RdxDesc, NewVecOp,
9335                                         PrevInChain);
9336       else
9337         NewRed = State.Builder.CreateBinOp(
9338             (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), PrevInChain,
9339             NewVecOp);
9340       PrevInChain = NewRed;
9341     } else {
9342       PrevInChain = State.get(getChainOp(), Part);
9343       NewRed = createTargetReduction(State.Builder, RdxDesc, NewVecOp);
9344     }
9345     if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9346       NextInChain = createMinMaxOp(State.Builder, RdxDesc.getRecurrenceKind(),
9347                                    NewRed, PrevInChain);
9348     } else if (IsOrdered)
9349       NextInChain = NewRed;
9350     else
9351       NextInChain = State.Builder.CreateBinOp(
9352           (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, PrevInChain);
9353     State.set(this, NextInChain, Part);
9354   }
9355 }
9356 
9357 void VPReplicateRecipe::execute(VPTransformState &State) {
9358   Instruction *UI = getUnderlyingInstr();
9359   if (State.Instance) { // Generate a single instance.
9360     assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9361     State.ILV->scalarizeInstruction(UI, this, *State.Instance, State);
9362     // Insert scalar instance packing it into a vector.
9363     if (State.VF.isVector() && shouldPack()) {
9364       // If we're constructing lane 0, initialize to start from poison.
9365       if (State.Instance->Lane.isFirstLane()) {
9366         assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9367         Value *Poison = PoisonValue::get(
9368             VectorType::get(UI->getType(), State.VF));
9369         State.set(this, Poison, State.Instance->Part);
9370       }
9371       State.packScalarIntoVectorValue(this, *State.Instance);
9372     }
9373     return;
9374   }
9375 
9376   if (IsUniform) {
9377     // If the recipe is uniform across all parts (instead of just per VF), only
9378     // generate a single instance.
9379     if ((isa<LoadInst>(UI) || isa<StoreInst>(UI)) &&
9380         all_of(operands(), [](VPValue *Op) {
9381           return Op->isDefinedOutsideVectorRegions();
9382         })) {
9383       State.ILV->scalarizeInstruction(UI, this, VPIteration(0, 0), State);
9384       if (user_begin() != user_end()) {
9385         for (unsigned Part = 1; Part < State.UF; ++Part)
9386           State.set(this, State.get(this, VPIteration(0, 0)),
9387                     VPIteration(Part, 0));
9388       }
9389       return;
9390     }
9391 
9392     // Uniform within VL means we need to generate lane 0 only for each
9393     // unrolled copy.
9394     for (unsigned Part = 0; Part < State.UF; ++Part)
9395       State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, 0), State);
9396     return;
9397   }
9398 
9399   // A store of a loop varying value to a uniform address only needs the last
9400   // copy of the store.
9401   if (isa<StoreInst>(UI) &&
9402       vputils::isUniformAfterVectorization(getOperand(1))) {
9403     auto Lane = VPLane::getLastLaneForVF(State.VF);
9404     State.ILV->scalarizeInstruction(UI, this, VPIteration(State.UF - 1, Lane),
9405                                     State);
9406     return;
9407   }
9408 
9409   // Generate scalar instances for all VF lanes of all UF parts.
9410   assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9411   const unsigned EndLane = State.VF.getKnownMinValue();
9412   for (unsigned Part = 0; Part < State.UF; ++Part)
9413     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9414       State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, Lane), State);
9415 }
9416 
9417 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
9418   VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;
9419 
9420   // Attempt to issue a wide load.
9421   LoadInst *LI = dyn_cast<LoadInst>(&Ingredient);
9422   StoreInst *SI = dyn_cast<StoreInst>(&Ingredient);
9423 
9424   assert((LI || SI) && "Invalid Load/Store instruction");
9425   assert((!SI || StoredValue) && "No stored value provided for widened store");
9426   assert((!LI || !StoredValue) && "Stored value provided for widened load");
9427 
9428   Type *ScalarDataTy = getLoadStoreType(&Ingredient);
9429 
9430   auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
9431   const Align Alignment = getLoadStoreAlignment(&Ingredient);
9432   bool CreateGatherScatter = !isConsecutive();
9433 
9434   auto &Builder = State.Builder;
9435   InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF);
9436   bool isMaskRequired = getMask();
9437   if (isMaskRequired) {
9438     // Mask reversal is only needed for non-all-one (null) masks, as reverse of
9439     // a null all-one mask is a null mask.
9440     for (unsigned Part = 0; Part < State.UF; ++Part) {
9441       Value *Mask = State.get(getMask(), Part);
9442       if (isReverse())
9443         Mask = Builder.CreateVectorReverse(Mask, "reverse");
9444       BlockInMaskParts[Part] = Mask;
9445     }
9446   }
9447 
9448   // Handle Stores:
9449   if (SI) {
9450     State.setDebugLocFrom(SI->getDebugLoc());
9451 
9452     for (unsigned Part = 0; Part < State.UF; ++Part) {
9453       Instruction *NewSI = nullptr;
9454       Value *StoredVal = State.get(StoredValue, Part);
9455       if (CreateGatherScatter) {
9456         Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
9457         Value *VectorGep = State.get(getAddr(), Part);
9458         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
9459                                             MaskPart);
9460       } else {
9461         if (isReverse()) {
9462           // If we store to reverse consecutive memory locations, then we need
9463           // to reverse the order of elements in the stored value.
9464           StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
9465           // We don't want to update the value in the map as it might be used in
9466           // another expression. So don't call resetVectorValue(StoredVal).
9467         }
9468         auto *VecPtr = State.get(getAddr(), Part);
9469         if (isMaskRequired)
9470           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
9471                                             BlockInMaskParts[Part]);
9472         else
9473           NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
9474       }
9475       State.addMetadata(NewSI, SI);
9476     }
9477     return;
9478   }
9479 
9480   // Handle loads.
9481   assert(LI && "Must have a load instruction");
9482   State.setDebugLocFrom(LI->getDebugLoc());
9483   for (unsigned Part = 0; Part < State.UF; ++Part) {
9484     Value *NewLI;
9485     if (CreateGatherScatter) {
9486       Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
9487       Value *VectorGep = State.get(getAddr(), Part);
9488       NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart,
9489                                          nullptr, "wide.masked.gather");
9490       State.addMetadata(NewLI, LI);
9491     } else {
9492       auto *VecPtr = State.get(getAddr(), Part);
9493       if (isMaskRequired)
9494         NewLI = Builder.CreateMaskedLoad(
9495             DataTy, VecPtr, Alignment, BlockInMaskParts[Part],
9496             PoisonValue::get(DataTy), "wide.masked.load");
9497       else
9498         NewLI =
9499             Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
9500 
9501       // Add metadata to the load, but setVectorValue to the reverse shuffle.
9502       State.addMetadata(NewLI, LI);
9503       if (Reverse)
9504         NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
9505     }
9506 
9507     State.set(getVPSingleValue(), NewLI, Part);
9508   }
9509 }
9510 
9511 // Determine how to lower the scalar epilogue, which depends on 1) optimising
9512 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
9513 // predication, and 4) a TTI hook that analyses whether the loop is suitable
9514 // for predication.
9515 static ScalarEpilogueLowering getScalarEpilogueLowering(
9516     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
9517     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
9518     LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI) {
9519   // 1) OptSize takes precedence over all other options, i.e. if this is set,
9520   // don't look at hints or options, and don't request a scalar epilogue.
9521   // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
9522   // LoopAccessInfo (due to code dependency and not being able to reliably get
9523   // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
9524   // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
9525   // versioning when the vectorization is forced, unlike hasOptSize. So revert
9526   // back to the old way and vectorize with versioning when forced. See D81345.)
9527   if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
9528                                                       PGSOQueryType::IRPass) &&
9529                           Hints.getForce() != LoopVectorizeHints::FK_Enabled))
9530     return CM_ScalarEpilogueNotAllowedOptSize;
9531 
9532   // 2) If set, obey the directives
9533   if (PreferPredicateOverEpilogue.getNumOccurrences()) {
9534     switch (PreferPredicateOverEpilogue) {
9535     case PreferPredicateTy::ScalarEpilogue:
9536       return CM_ScalarEpilogueAllowed;
9537     case PreferPredicateTy::PredicateElseScalarEpilogue:
9538       return CM_ScalarEpilogueNotNeededUsePredicate;
9539     case PreferPredicateTy::PredicateOrDontVectorize:
9540       return CM_ScalarEpilogueNotAllowedUsePredicate;
9541     };
9542   }
9543 
9544   // 3) If set, obey the hints
9545   switch (Hints.getPredicate()) {
9546   case LoopVectorizeHints::FK_Enabled:
9547     return CM_ScalarEpilogueNotNeededUsePredicate;
9548   case LoopVectorizeHints::FK_Disabled:
9549     return CM_ScalarEpilogueAllowed;
9550   };
9551 
9552   // 4) if the TTI hook indicates this is profitable, request predication.
9553   TailFoldingInfo TFI(TLI, &LVL, IAI);
9554   if (TTI->preferPredicateOverEpilogue(&TFI))
9555     return CM_ScalarEpilogueNotNeededUsePredicate;
9556 
9557   return CM_ScalarEpilogueAllowed;
9558 }
9559 
9560 // Process the loop in the VPlan-native vectorization path. This path builds
9561 // VPlan upfront in the vectorization pipeline, which allows to apply
9562 // VPlan-to-VPlan transformations from the very beginning without modifying the
9563 // input LLVM IR.
9564 static bool processLoopInVPlanNativePath(
9565     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
9566     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
9567     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
9568     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
9569     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
9570     LoopVectorizationRequirements &Requirements) {
9571 
9572   if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
9573     LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
9574     return false;
9575   }
9576   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
9577   Function *F = L->getHeader()->getParent();
9578   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
9579 
9580   ScalarEpilogueLowering SEL =
9581       getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, *LVL, &IAI);
9582 
9583   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
9584                                 &Hints, IAI);
9585   // Use the planner for outer loop vectorization.
9586   // TODO: CM is not used at this point inside the planner. Turn CM into an
9587   // optional argument if we don't need it in the future.
9588   LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, IAI, PSE, Hints,
9589                                ORE);
9590 
9591   // Get user vectorization factor.
9592   ElementCount UserVF = Hints.getWidth();
9593 
9594   CM.collectElementTypesForWidening();
9595 
9596   // Plan how to best vectorize, return the best VF and its cost.
9597   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
9598 
9599   // If we are stress testing VPlan builds, do not attempt to generate vector
9600   // code. Masked vector code generation support will follow soon.
9601   // Also, do not attempt to vectorize if no vector code will be produced.
9602   if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF)
9603     return false;
9604 
9605   VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
9606 
9607   {
9608     bool AddBranchWeights =
9609         hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
9610     GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
9611                              F->getParent()->getDataLayout(), AddBranchWeights);
9612     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
9613                            VF.Width, 1, LVL, &CM, BFI, PSI, Checks);
9614     LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
9615                       << L->getHeader()->getParent()->getName() << "\"\n");
9616     LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false);
9617   }
9618 
9619   reportVectorization(ORE, L, VF, 1);
9620 
9621   // Mark the loop as already vectorized to avoid vectorizing again.
9622   Hints.setAlreadyVectorized();
9623   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
9624   return true;
9625 }
9626 
9627 // Emit a remark if there are stores to floats that required a floating point
9628 // extension. If the vectorized loop was generated with floating point there
9629 // will be a performance penalty from the conversion overhead and the change in
9630 // the vector width.
9631 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
9632   SmallVector<Instruction *, 4> Worklist;
9633   for (BasicBlock *BB : L->getBlocks()) {
9634     for (Instruction &Inst : *BB) {
9635       if (auto *S = dyn_cast<StoreInst>(&Inst)) {
9636         if (S->getValueOperand()->getType()->isFloatTy())
9637           Worklist.push_back(S);
9638       }
9639     }
9640   }
9641 
9642   // Traverse the floating point stores upwards searching, for floating point
9643   // conversions.
9644   SmallPtrSet<const Instruction *, 4> Visited;
9645   SmallPtrSet<const Instruction *, 4> EmittedRemark;
9646   while (!Worklist.empty()) {
9647     auto *I = Worklist.pop_back_val();
9648     if (!L->contains(I))
9649       continue;
9650     if (!Visited.insert(I).second)
9651       continue;
9652 
9653     // Emit a remark if the floating point store required a floating
9654     // point conversion.
9655     // TODO: More work could be done to identify the root cause such as a
9656     // constant or a function return type and point the user to it.
9657     if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
9658       ORE->emit([&]() {
9659         return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
9660                                           I->getDebugLoc(), L->getHeader())
9661                << "floating point conversion changes vector width. "
9662                << "Mixed floating point precision requires an up/down "
9663                << "cast that will negatively impact performance.";
9664       });
9665 
9666     for (Use &Op : I->operands())
9667       if (auto *OpI = dyn_cast<Instruction>(Op))
9668         Worklist.push_back(OpI);
9669   }
9670 }
9671 
9672 static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
9673                                        VectorizationFactor &VF,
9674                                        std::optional<unsigned> VScale, Loop *L,
9675                                        ScalarEvolution &SE,
9676                                        ScalarEpilogueLowering SEL) {
9677   InstructionCost CheckCost = Checks.getCost();
9678   if (!CheckCost.isValid())
9679     return false;
9680 
9681   // When interleaving only scalar and vector cost will be equal, which in turn
9682   // would lead to a divide by 0. Fall back to hard threshold.
9683   if (VF.Width.isScalar()) {
9684     if (CheckCost > VectorizeMemoryCheckThreshold) {
9685       LLVM_DEBUG(
9686           dbgs()
9687           << "LV: Interleaving only is not profitable due to runtime checks\n");
9688       return false;
9689     }
9690     return true;
9691   }
9692 
9693   // The scalar cost should only be 0 when vectorizing with a user specified VF/IC. In those cases, runtime checks should always be generated.
9694   double ScalarC = *VF.ScalarCost.getValue();
9695   if (ScalarC == 0)
9696     return true;
9697 
9698   // First, compute the minimum iteration count required so that the vector
9699   // loop outperforms the scalar loop.
9700   //  The total cost of the scalar loop is
9701   //   ScalarC * TC
9702   //  where
9703   //  * TC is the actual trip count of the loop.
9704   //  * ScalarC is the cost of a single scalar iteration.
9705   //
9706   //  The total cost of the vector loop is
9707   //    RtC + VecC * (TC / VF) + EpiC
9708   //  where
9709   //  * RtC is the cost of the generated runtime checks
9710   //  * VecC is the cost of a single vector iteration.
9711   //  * TC is the actual trip count of the loop
9712   //  * VF is the vectorization factor
9713   //  * EpiCost is the cost of the generated epilogue, including the cost
9714   //    of the remaining scalar operations.
9715   //
9716   // Vectorization is profitable once the total vector cost is less than the
9717   // total scalar cost:
9718   //   RtC + VecC * (TC / VF) + EpiC <  ScalarC * TC
9719   //
9720   // Now we can compute the minimum required trip count TC as
9721   //   (RtC + EpiC) / (ScalarC - (VecC / VF)) < TC
9722   //
9723   // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
9724   // the computations are performed on doubles, not integers and the result
9725   // is rounded up, hence we get an upper estimate of the TC.
9726   unsigned IntVF = VF.Width.getKnownMinValue();
9727   if (VF.Width.isScalable()) {
9728     unsigned AssumedMinimumVscale = 1;
9729     if (VScale)
9730       AssumedMinimumVscale = *VScale;
9731     IntVF *= AssumedMinimumVscale;
9732   }
9733   double VecCOverVF = double(*VF.Cost.getValue()) / IntVF;
9734   double RtC = *CheckCost.getValue();
9735   double MinTC1 = RtC / (ScalarC - VecCOverVF);
9736 
9737   // Second, compute a minimum iteration count so that the cost of the
9738   // runtime checks is only a fraction of the total scalar loop cost. This
9739   // adds a loop-dependent bound on the overhead incurred if the runtime
9740   // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC
9741   // * TC. To bound the runtime check to be a fraction 1/X of the scalar
9742   // cost, compute
9743   //   RtC < ScalarC * TC * (1 / X)  ==>  RtC * X / ScalarC < TC
9744   double MinTC2 = RtC * 10 / ScalarC;
9745 
9746   // Now pick the larger minimum. If it is not a multiple of VF and a scalar
9747   // epilogue is allowed, choose the next closest multiple of VF. This should
9748   // partly compensate for ignoring the epilogue cost.
9749   uint64_t MinTC = std::ceil(std::max(MinTC1, MinTC2));
9750   if (SEL == CM_ScalarEpilogueAllowed)
9751     MinTC = alignTo(MinTC, IntVF);
9752   VF.MinProfitableTripCount = ElementCount::getFixed(MinTC);
9753 
9754   LLVM_DEBUG(
9755       dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
9756              << VF.MinProfitableTripCount << "\n");
9757 
9758   // Skip vectorization if the expected trip count is less than the minimum
9759   // required trip count.
9760   if (auto ExpectedTC = getSmallBestKnownTC(SE, L)) {
9761     if (ElementCount::isKnownLT(ElementCount::getFixed(*ExpectedTC),
9762                                 VF.MinProfitableTripCount)) {
9763       LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
9764                            "trip count < minimum profitable VF ("
9765                         << *ExpectedTC << " < " << VF.MinProfitableTripCount
9766                         << ")\n");
9767 
9768       return false;
9769     }
9770   }
9771   return true;
9772 }
9773 
9774 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
9775     : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
9776                                !EnableLoopInterleaving),
9777       VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
9778                               !EnableLoopVectorization) {}
9779 
9780 bool LoopVectorizePass::processLoop(Loop *L) {
9781   assert((EnableVPlanNativePath || L->isInnermost()) &&
9782          "VPlan-native path is not enabled. Only process inner loops.");
9783 
9784 #ifndef NDEBUG
9785   const std::string DebugLocStr = getDebugLocString(L);
9786 #endif /* NDEBUG */
9787 
9788   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
9789                     << L->getHeader()->getParent()->getName() << "' from "
9790                     << DebugLocStr << "\n");
9791 
9792   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
9793 
9794   LLVM_DEBUG(
9795       dbgs() << "LV: Loop hints:"
9796              << " force="
9797              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
9798                      ? "disabled"
9799                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
9800                             ? "enabled"
9801                             : "?"))
9802              << " width=" << Hints.getWidth()
9803              << " interleave=" << Hints.getInterleave() << "\n");
9804 
9805   // Function containing loop
9806   Function *F = L->getHeader()->getParent();
9807 
9808   // Looking at the diagnostic output is the only way to determine if a loop
9809   // was vectorized (other than looking at the IR or machine code), so it
9810   // is important to generate an optimization remark for each loop. Most of
9811   // these messages are generated as OptimizationRemarkAnalysis. Remarks
9812   // generated as OptimizationRemark and OptimizationRemarkMissed are
9813   // less verbose reporting vectorized loops and unvectorized loops that may
9814   // benefit from vectorization, respectively.
9815 
9816   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
9817     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
9818     return false;
9819   }
9820 
9821   PredicatedScalarEvolution PSE(*SE, *L);
9822 
9823   // Check if it is legal to vectorize the loop.
9824   LoopVectorizationRequirements Requirements;
9825   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE,
9826                                 &Requirements, &Hints, DB, AC, BFI, PSI);
9827   if (!LVL.canVectorize(EnableVPlanNativePath)) {
9828     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
9829     Hints.emitRemarkWithHints();
9830     return false;
9831   }
9832 
9833   // Entrance to the VPlan-native vectorization path. Outer loops are processed
9834   // here. They may require CFG and instruction level transformations before
9835   // even evaluating whether vectorization is profitable. Since we cannot modify
9836   // the incoming IR, we need to build VPlan upfront in the vectorization
9837   // pipeline.
9838   if (!L->isInnermost())
9839     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
9840                                         ORE, BFI, PSI, Hints, Requirements);
9841 
9842   assert(L->isInnermost() && "Inner loop expected.");
9843 
9844   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
9845   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
9846 
9847   // If an override option has been passed in for interleaved accesses, use it.
9848   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
9849     UseInterleaved = EnableInterleavedMemAccesses;
9850 
9851   // Analyze interleaved memory accesses.
9852   if (UseInterleaved)
9853     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
9854 
9855   // Check the function attributes and profiles to find out if this function
9856   // should be optimized for size.
9857   ScalarEpilogueLowering SEL =
9858       getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL, &IAI);
9859 
9860   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
9861   // count by optimizing for size, to minimize overheads.
9862   auto ExpectedTC = getSmallBestKnownTC(*SE, L);
9863   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
9864     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
9865                       << "This loop is worth vectorizing only if no scalar "
9866                       << "iteration overheads are incurred.");
9867     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
9868       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
9869     else {
9870       if (*ExpectedTC > TTI->getMinTripCountTailFoldingThreshold()) {
9871         LLVM_DEBUG(dbgs() << "\n");
9872         // Predicate tail-folded loops are efficient even when the loop
9873         // iteration count is low. However, setting the epilogue policy to
9874         // `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops
9875         // with runtime checks. It's more effective to let
9876         // `areRuntimeChecksProfitable` determine if vectorization is beneficial
9877         // for the loop.
9878         if (SEL != CM_ScalarEpilogueNotNeededUsePredicate)
9879           SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
9880       } else {
9881         LLVM_DEBUG(dbgs() << " But the target considers the trip count too "
9882                              "small to consider vectorizing.\n");
9883         reportVectorizationFailure(
9884             "The trip count is below the minial threshold value.",
9885             "loop trip count is too low, avoiding vectorization",
9886             "LowTripCount", ORE, L);
9887         Hints.emitRemarkWithHints();
9888         return false;
9889       }
9890     }
9891   }
9892 
9893   // Check the function attributes to see if implicit floats or vectors are
9894   // allowed.
9895   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
9896     reportVectorizationFailure(
9897         "Can't vectorize when the NoImplicitFloat attribute is used",
9898         "loop not vectorized due to NoImplicitFloat attribute",
9899         "NoImplicitFloat", ORE, L);
9900     Hints.emitRemarkWithHints();
9901     return false;
9902   }
9903 
9904   // Check if the target supports potentially unsafe FP vectorization.
9905   // FIXME: Add a check for the type of safety issue (denormal, signaling)
9906   // for the target we're vectorizing for, to make sure none of the
9907   // additional fp-math flags can help.
9908   if (Hints.isPotentiallyUnsafe() &&
9909       TTI->isFPVectorizationPotentiallyUnsafe()) {
9910     reportVectorizationFailure(
9911         "Potentially unsafe FP op prevents vectorization",
9912         "loop not vectorized due to unsafe FP support.",
9913         "UnsafeFP", ORE, L);
9914     Hints.emitRemarkWithHints();
9915     return false;
9916   }
9917 
9918   bool AllowOrderedReductions;
9919   // If the flag is set, use that instead and override the TTI behaviour.
9920   if (ForceOrderedReductions.getNumOccurrences() > 0)
9921     AllowOrderedReductions = ForceOrderedReductions;
9922   else
9923     AllowOrderedReductions = TTI->enableOrderedReductions();
9924   if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) {
9925     ORE->emit([&]() {
9926       auto *ExactFPMathInst = Requirements.getExactFPInst();
9927       return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
9928                                                  ExactFPMathInst->getDebugLoc(),
9929                                                  ExactFPMathInst->getParent())
9930              << "loop not vectorized: cannot prove it is safe to reorder "
9931                 "floating-point operations";
9932     });
9933     LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
9934                          "reorder floating-point operations\n");
9935     Hints.emitRemarkWithHints();
9936     return false;
9937   }
9938 
9939   // Use the cost model.
9940   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
9941                                 F, &Hints, IAI);
9942   // Use the planner for vectorization.
9943   LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
9944                                ORE);
9945 
9946   // Get user vectorization factor and interleave count.
9947   ElementCount UserVF = Hints.getWidth();
9948   unsigned UserIC = Hints.getInterleave();
9949 
9950   // Plan how to best vectorize, return the best VF and its cost.
9951   std::optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
9952 
9953   VectorizationFactor VF = VectorizationFactor::Disabled();
9954   unsigned IC = 1;
9955 
9956   bool AddBranchWeights =
9957       hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
9958   GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
9959                            F->getParent()->getDataLayout(), AddBranchWeights);
9960   if (MaybeVF) {
9961     VF = *MaybeVF;
9962     // Select the interleave count.
9963     IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
9964 
9965     unsigned SelectedIC = std::max(IC, UserIC);
9966     //  Optimistically generate runtime checks if they are needed. Drop them if
9967     //  they turn out to not be profitable.
9968     if (VF.Width.isVector() || SelectedIC > 1)
9969       Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
9970 
9971     // Check if it is profitable to vectorize with runtime checks.
9972     bool ForceVectorization =
9973         Hints.getForce() == LoopVectorizeHints::FK_Enabled;
9974     if (!ForceVectorization &&
9975         !areRuntimeChecksProfitable(Checks, VF, getVScaleForTuning(L, *TTI), L,
9976                                     *PSE.getSE(), SEL)) {
9977       ORE->emit([&]() {
9978         return OptimizationRemarkAnalysisAliasing(
9979                    DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
9980                    L->getHeader())
9981                << "loop not vectorized: cannot prove it is safe to reorder "
9982                   "memory operations";
9983       });
9984       LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
9985       Hints.emitRemarkWithHints();
9986       return false;
9987     }
9988   }
9989 
9990   // Identify the diagnostic messages that should be produced.
9991   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
9992   bool VectorizeLoop = true, InterleaveLoop = true;
9993   if (VF.Width.isScalar()) {
9994     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
9995     VecDiagMsg = std::make_pair(
9996         "VectorizationNotBeneficial",
9997         "the cost-model indicates that vectorization is not beneficial");
9998     VectorizeLoop = false;
9999   }
10000 
10001   if (!MaybeVF && UserIC > 1) {
10002     // Tell the user interleaving was avoided up-front, despite being explicitly
10003     // requested.
10004     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
10005                          "interleaving should be avoided up front\n");
10006     IntDiagMsg = std::make_pair(
10007         "InterleavingAvoided",
10008         "Ignoring UserIC, because interleaving was avoided up front");
10009     InterleaveLoop = false;
10010   } else if (IC == 1 && UserIC <= 1) {
10011     // Tell the user interleaving is not beneficial.
10012     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
10013     IntDiagMsg = std::make_pair(
10014         "InterleavingNotBeneficial",
10015         "the cost-model indicates that interleaving is not beneficial");
10016     InterleaveLoop = false;
10017     if (UserIC == 1) {
10018       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
10019       IntDiagMsg.second +=
10020           " and is explicitly disabled or interleave count is set to 1";
10021     }
10022   } else if (IC > 1 && UserIC == 1) {
10023     // Tell the user interleaving is beneficial, but it explicitly disabled.
10024     LLVM_DEBUG(
10025         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
10026     IntDiagMsg = std::make_pair(
10027         "InterleavingBeneficialButDisabled",
10028         "the cost-model indicates that interleaving is beneficial "
10029         "but is explicitly disabled or interleave count is set to 1");
10030     InterleaveLoop = false;
10031   }
10032 
10033   // Override IC if user provided an interleave count.
10034   IC = UserIC > 0 ? UserIC : IC;
10035 
10036   // Emit diagnostic messages, if any.
10037   const char *VAPassName = Hints.vectorizeAnalysisPassName();
10038   if (!VectorizeLoop && !InterleaveLoop) {
10039     // Do not vectorize or interleaving the loop.
10040     ORE->emit([&]() {
10041       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
10042                                       L->getStartLoc(), L->getHeader())
10043              << VecDiagMsg.second;
10044     });
10045     ORE->emit([&]() {
10046       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
10047                                       L->getStartLoc(), L->getHeader())
10048              << IntDiagMsg.second;
10049     });
10050     return false;
10051   } else if (!VectorizeLoop && InterleaveLoop) {
10052     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10053     ORE->emit([&]() {
10054       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
10055                                         L->getStartLoc(), L->getHeader())
10056              << VecDiagMsg.second;
10057     });
10058   } else if (VectorizeLoop && !InterleaveLoop) {
10059     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10060                       << ") in " << DebugLocStr << '\n');
10061     ORE->emit([&]() {
10062       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
10063                                         L->getStartLoc(), L->getHeader())
10064              << IntDiagMsg.second;
10065     });
10066   } else if (VectorizeLoop && InterleaveLoop) {
10067     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10068                       << ") in " << DebugLocStr << '\n');
10069     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10070   }
10071 
10072   bool DisableRuntimeUnroll = false;
10073   MDNode *OrigLoopID = L->getLoopID();
10074   {
10075     using namespace ore;
10076     if (!VectorizeLoop) {
10077       assert(IC > 1 && "interleave count should not be 1 or 0");
10078       // If we decided that it is not legal to vectorize the loop, then
10079       // interleave it.
10080       InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
10081                                  &CM, BFI, PSI, Checks);
10082 
10083       VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10084       LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);
10085 
10086       ORE->emit([&]() {
10087         return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
10088                                   L->getHeader())
10089                << "interleaved loop (interleaved count: "
10090                << NV("InterleaveCount", IC) << ")";
10091       });
10092     } else {
10093       // If we decided that it is *legal* to vectorize the loop, then do it.
10094 
10095       // Consider vectorizing the epilogue too if it's profitable.
10096       VectorizationFactor EpilogueVF =
10097           LVP.selectEpilogueVectorizationFactor(VF.Width, IC);
10098       if (EpilogueVF.Width.isVector()) {
10099 
10100         // The first pass vectorizes the main loop and creates a scalar epilogue
10101         // to be vectorized by executing the plan (potentially with a different
10102         // factor) again shortly afterwards.
10103         EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1);
10104         EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
10105                                            EPI, &LVL, &CM, BFI, PSI, Checks);
10106 
10107         VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF);
10108         const auto &[ExpandedSCEVs, ReductionResumeValues] = LVP.executePlan(
10109             EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV, DT, true);
10110         ++LoopsVectorized;
10111 
10112         // Second pass vectorizes the epilogue and adjusts the control flow
10113         // edges from the first pass.
10114         EPI.MainLoopVF = EPI.EpilogueVF;
10115         EPI.MainLoopUF = EPI.EpilogueUF;
10116         EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
10117                                                  ORE, EPI, &LVL, &CM, BFI, PSI,
10118                                                  Checks);
10119 
10120         VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF);
10121         VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion();
10122         VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
10123         Header->setName("vec.epilog.vector.body");
10124 
10125         // Re-use the trip count and steps expanded for the main loop, as
10126         // skeleton creation needs it as a value that dominates both the scalar
10127         // and vector epilogue loops
10128         // TODO: This is a workaround needed for epilogue vectorization and it
10129         // should be removed once induction resume value creation is done
10130         // directly in VPlan.
10131         EpilogILV.setTripCount(MainILV.getTripCount());
10132         for (auto &R : make_early_inc_range(*BestEpiPlan.getPreheader())) {
10133           auto *ExpandR = cast<VPExpandSCEVRecipe>(&R);
10134           auto *ExpandedVal = BestEpiPlan.getVPValueOrAddLiveIn(
10135               ExpandedSCEVs.find(ExpandR->getSCEV())->second);
10136           ExpandR->replaceAllUsesWith(ExpandedVal);
10137           ExpandR->eraseFromParent();
10138         }
10139 
10140         // Ensure that the start values for any VPWidenIntOrFpInductionRecipe,
10141         // VPWidenPointerInductionRecipe and VPReductionPHIRecipes are updated
10142         // before vectorizing the epilogue loop.
10143         for (VPRecipeBase &R : Header->phis()) {
10144           if (isa<VPCanonicalIVPHIRecipe>(&R))
10145             continue;
10146 
10147           Value *ResumeV = nullptr;
10148           // TODO: Move setting of resume values to prepareToExecute.
10149           if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
10150             ResumeV = ReductionResumeValues
10151                           .find(&ReductionPhi->getRecurrenceDescriptor())
10152                           ->second;
10153           } else {
10154             // Create induction resume values for both widened pointer and
10155             // integer/fp inductions and update the start value of the induction
10156             // recipes to use the resume value.
10157             PHINode *IndPhi = nullptr;
10158             const InductionDescriptor *ID;
10159             if (auto *Ind = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
10160               IndPhi = cast<PHINode>(Ind->getUnderlyingValue());
10161               ID = &Ind->getInductionDescriptor();
10162             } else {
10163               auto *WidenInd = cast<VPWidenIntOrFpInductionRecipe>(&R);
10164               IndPhi = WidenInd->getPHINode();
10165               ID = &WidenInd->getInductionDescriptor();
10166             }
10167 
10168             ResumeV = MainILV.createInductionResumeValue(
10169                 IndPhi, *ID, getExpandedStep(*ID, ExpandedSCEVs),
10170                 {EPI.MainLoopIterationCountCheck});
10171           }
10172           assert(ResumeV && "Must have a resume value");
10173           VPValue *StartVal = BestEpiPlan.getVPValueOrAddLiveIn(ResumeV);
10174           cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
10175         }
10176 
10177         LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
10178                         DT, true, &ExpandedSCEVs);
10179         ++LoopsEpilogueVectorized;
10180 
10181         if (!MainILV.areSafetyChecksAdded())
10182           DisableRuntimeUnroll = true;
10183       } else {
10184         InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
10185                                VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
10186                                PSI, Checks);
10187 
10188         VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10189         LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
10190         ++LoopsVectorized;
10191 
10192         // Add metadata to disable runtime unrolling a scalar loop when there
10193         // are no runtime checks about strides and memory. A scalar loop that is
10194         // rarely used is not worth unrolling.
10195         if (!LB.areSafetyChecksAdded())
10196           DisableRuntimeUnroll = true;
10197       }
10198       // Report the vectorization decision.
10199       reportVectorization(ORE, L, VF, IC);
10200     }
10201 
10202     if (ORE->allowExtraAnalysis(LV_NAME))
10203       checkMixedPrecision(L, ORE);
10204   }
10205 
10206   std::optional<MDNode *> RemainderLoopID =
10207       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
10208                                       LLVMLoopVectorizeFollowupEpilogue});
10209   if (RemainderLoopID) {
10210     L->setLoopID(*RemainderLoopID);
10211   } else {
10212     if (DisableRuntimeUnroll)
10213       AddRuntimeUnrollDisableMetaData(L);
10214 
10215     // Mark the loop as already vectorized to avoid vectorizing again.
10216     Hints.setAlreadyVectorized();
10217   }
10218 
10219   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10220   return true;
10221 }
10222 
10223 LoopVectorizeResult LoopVectorizePass::runImpl(
10224     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
10225     DominatorTree &DT_, BlockFrequencyInfo *BFI_, TargetLibraryInfo *TLI_,
10226     DemandedBits &DB_, AssumptionCache &AC_, LoopAccessInfoManager &LAIs_,
10227     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
10228   SE = &SE_;
10229   LI = &LI_;
10230   TTI = &TTI_;
10231   DT = &DT_;
10232   BFI = BFI_;
10233   TLI = TLI_;
10234   AC = &AC_;
10235   LAIs = &LAIs_;
10236   DB = &DB_;
10237   ORE = &ORE_;
10238   PSI = PSI_;
10239 
10240   // Don't attempt if
10241   // 1. the target claims to have no vector registers, and
10242   // 2. interleaving won't help ILP.
10243   //
10244   // The second condition is necessary because, even if the target has no
10245   // vector registers, loop vectorization may still enable scalar
10246   // interleaving.
10247   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
10248       TTI->getMaxInterleaveFactor(ElementCount::getFixed(1)) < 2)
10249     return LoopVectorizeResult(false, false);
10250 
10251   bool Changed = false, CFGChanged = false;
10252 
10253   // The vectorizer requires loops to be in simplified form.
10254   // Since simplification may add new inner loops, it has to run before the
10255   // legality and profitability checks. This means running the loop vectorizer
10256   // will simplify all loops, regardless of whether anything end up being
10257   // vectorized.
10258   for (const auto &L : *LI)
10259     Changed |= CFGChanged |=
10260         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10261 
10262   // Build up a worklist of inner-loops to vectorize. This is necessary as
10263   // the act of vectorizing or partially unrolling a loop creates new loops
10264   // and can invalidate iterators across the loops.
10265   SmallVector<Loop *, 8> Worklist;
10266 
10267   for (Loop *L : *LI)
10268     collectSupportedLoops(*L, LI, ORE, Worklist);
10269 
10270   LoopsAnalyzed += Worklist.size();
10271 
10272   // Now walk the identified inner loops.
10273   while (!Worklist.empty()) {
10274     Loop *L = Worklist.pop_back_val();
10275 
10276     // For the inner loops we actually process, form LCSSA to simplify the
10277     // transform.
10278     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
10279 
10280     Changed |= CFGChanged |= processLoop(L);
10281 
10282     if (Changed) {
10283       LAIs->clear();
10284 
10285 #ifndef NDEBUG
10286       if (VerifySCEV)
10287         SE->verify();
10288 #endif
10289     }
10290   }
10291 
10292   // Process each loop nest in the function.
10293   return LoopVectorizeResult(Changed, CFGChanged);
10294 }
10295 
10296 PreservedAnalyses LoopVectorizePass::run(Function &F,
10297                                          FunctionAnalysisManager &AM) {
10298     auto &LI = AM.getResult<LoopAnalysis>(F);
10299     // There are no loops in the function. Return before computing other expensive
10300     // analyses.
10301     if (LI.empty())
10302       return PreservedAnalyses::all();
10303     auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
10304     auto &TTI = AM.getResult<TargetIRAnalysis>(F);
10305     auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
10306     auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
10307     auto &AC = AM.getResult<AssumptionAnalysis>(F);
10308     auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
10309     auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
10310 
10311     LoopAccessInfoManager &LAIs = AM.getResult<LoopAccessAnalysis>(F);
10312     auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
10313     ProfileSummaryInfo *PSI =
10314         MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
10315     BlockFrequencyInfo *BFI = nullptr;
10316     if (PSI && PSI->hasProfileSummary())
10317       BFI = &AM.getResult<BlockFrequencyAnalysis>(F);
10318     LoopVectorizeResult Result =
10319         runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AC, LAIs, ORE, PSI);
10320     if (!Result.MadeAnyChange)
10321       return PreservedAnalyses::all();
10322     PreservedAnalyses PA;
10323 
10324     if (isAssignmentTrackingEnabled(*F.getParent())) {
10325       for (auto &BB : F)
10326         RemoveRedundantDbgInstrs(&BB);
10327     }
10328 
10329     // We currently do not preserve loopinfo/dominator analyses with outer loop
10330     // vectorization. Until this is addressed, mark these analyses as preserved
10331     // only for non-VPlan-native path.
10332     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
10333     if (!EnableVPlanNativePath) {
10334       PA.preserve<LoopAnalysis>();
10335       PA.preserve<DominatorTreeAnalysis>();
10336       PA.preserve<ScalarEvolutionAnalysis>();
10337     }
10338 
10339     if (Result.MadeCFGChange) {
10340       // Making CFG changes likely means a loop got vectorized. Indicate that
10341       // extra simplification passes should be run.
10342       // TODO: MadeCFGChanges is not a prefect proxy. Extra passes should only
10343       // be run if runtime checks have been added.
10344       AM.getResult<ShouldRunExtraVectorPasses>(F);
10345       PA.preserve<ShouldRunExtraVectorPasses>();
10346     } else {
10347       PA.preserveSet<CFGAnalyses>();
10348     }
10349     return PA;
10350 }
10351 
10352 void LoopVectorizePass::printPipeline(
10353     raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
10354   static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
10355       OS, MapClassName2PassName);
10356 
10357   OS << '<';
10358   OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
10359   OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
10360   OS << '>';
10361 }
10362