xref: /freebsd-src/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (revision 1db9f3b21e39176dd5b67cf8ac378633b172463e)
1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
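// As an illustrative sketch (not code from this file): with a vectorization
// factor of 4, a loop such as
//   for (i = 0; i < n; ++i) A[i] = B[i] + 42;
// is conceptually rewritten so that each wide iteration computes A[i..i+3]
// with <4 x i32> operations and the induction variable advances by 4.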
17 //
18 // This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
28 // There is a development effort going on to migrate the loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanAnalysis.h"
61 #include "VPlanHCFGBuilder.h"
62 #include "VPlanTransforms.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/STLExtras.h"
70 #include "llvm/ADT/SmallPtrSet.h"
71 #include "llvm/ADT/SmallSet.h"
72 #include "llvm/ADT/SmallVector.h"
73 #include "llvm/ADT/Statistic.h"
74 #include "llvm/ADT/StringRef.h"
75 #include "llvm/ADT/Twine.h"
76 #include "llvm/ADT/iterator_range.h"
77 #include "llvm/Analysis/AssumptionCache.h"
78 #include "llvm/Analysis/BasicAliasAnalysis.h"
79 #include "llvm/Analysis/BlockFrequencyInfo.h"
80 #include "llvm/Analysis/CFG.h"
81 #include "llvm/Analysis/CodeMetrics.h"
82 #include "llvm/Analysis/DemandedBits.h"
83 #include "llvm/Analysis/GlobalsModRef.h"
84 #include "llvm/Analysis/LoopAccessAnalysis.h"
85 #include "llvm/Analysis/LoopAnalysisManager.h"
86 #include "llvm/Analysis/LoopInfo.h"
87 #include "llvm/Analysis/LoopIterator.h"
88 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
89 #include "llvm/Analysis/ProfileSummaryInfo.h"
90 #include "llvm/Analysis/ScalarEvolution.h"
91 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
92 #include "llvm/Analysis/TargetLibraryInfo.h"
93 #include "llvm/Analysis/TargetTransformInfo.h"
94 #include "llvm/Analysis/ValueTracking.h"
95 #include "llvm/Analysis/VectorUtils.h"
96 #include "llvm/IR/Attributes.h"
97 #include "llvm/IR/BasicBlock.h"
98 #include "llvm/IR/CFG.h"
99 #include "llvm/IR/Constant.h"
100 #include "llvm/IR/Constants.h"
101 #include "llvm/IR/DataLayout.h"
102 #include "llvm/IR/DebugInfo.h"
103 #include "llvm/IR/DebugInfoMetadata.h"
104 #include "llvm/IR/DebugLoc.h"
105 #include "llvm/IR/DerivedTypes.h"
106 #include "llvm/IR/DiagnosticInfo.h"
107 #include "llvm/IR/Dominators.h"
108 #include "llvm/IR/Function.h"
109 #include "llvm/IR/IRBuilder.h"
110 #include "llvm/IR/InstrTypes.h"
111 #include "llvm/IR/Instruction.h"
112 #include "llvm/IR/Instructions.h"
113 #include "llvm/IR/IntrinsicInst.h"
114 #include "llvm/IR/Intrinsics.h"
115 #include "llvm/IR/MDBuilder.h"
116 #include "llvm/IR/Metadata.h"
117 #include "llvm/IR/Module.h"
118 #include "llvm/IR/Operator.h"
119 #include "llvm/IR/PatternMatch.h"
120 #include "llvm/IR/ProfDataUtils.h"
121 #include "llvm/IR/Type.h"
122 #include "llvm/IR/Use.h"
123 #include "llvm/IR/User.h"
124 #include "llvm/IR/Value.h"
125 #include "llvm/IR/ValueHandle.h"
126 #include "llvm/IR/Verifier.h"
127 #include "llvm/Support/Casting.h"
128 #include "llvm/Support/CommandLine.h"
129 #include "llvm/Support/Compiler.h"
130 #include "llvm/Support/Debug.h"
131 #include "llvm/Support/ErrorHandling.h"
132 #include "llvm/Support/InstructionCost.h"
133 #include "llvm/Support/MathExtras.h"
134 #include "llvm/Support/raw_ostream.h"
135 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
136 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
137 #include "llvm/Transforms/Utils/LoopSimplify.h"
138 #include "llvm/Transforms/Utils/LoopUtils.h"
139 #include "llvm/Transforms/Utils/LoopVersioning.h"
140 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
141 #include "llvm/Transforms/Utils/SizeOpts.h"
142 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
143 #include <algorithm>
144 #include <cassert>
145 #include <cmath>
146 #include <cstdint>
147 #include <functional>
148 #include <iterator>
149 #include <limits>
150 #include <map>
151 #include <memory>
152 #include <string>
153 #include <tuple>
154 #include <utility>
155 
156 using namespace llvm;
157 
158 #define LV_NAME "loop-vectorize"
159 #define DEBUG_TYPE LV_NAME
160 
161 #ifndef NDEBUG
162 const char VerboseDebug[] = DEBUG_TYPE "-verbose";
163 #endif
164 
165 /// @{
166 /// Metadata attribute names
167 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
168 const char LLVMLoopVectorizeFollowupVectorized[] =
169     "llvm.loop.vectorize.followup_vectorized";
170 const char LLVMLoopVectorizeFollowupEpilogue[] =
171     "llvm.loop.vectorize.followup_epilogue";
172 /// @}
173 
174 STATISTIC(LoopsVectorized, "Number of loops vectorized");
175 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
176 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
177 
178 static cl::opt<bool> EnableEpilogueVectorization(
179     "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
180     cl::desc("Enable vectorization of epilogue loops."));
181 
182 static cl::opt<unsigned> EpilogueVectorizationForceVF(
183     "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
184     cl::desc("When epilogue vectorization is enabled, and a value greater than "
185              "1 is specified, forces the given VF for all applicable epilogue "
186              "loops."));
187 
188 static cl::opt<unsigned> EpilogueVectorizationMinVF(
189     "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
190     cl::desc("Only loops with vectorization factor equal to or larger than "
191              "the specified value are considered for epilogue vectorization."));
192 
193 /// Loops with a known constant trip count below this number are vectorized only
194 /// if no scalar iteration overheads are incurred.
195 static cl::opt<unsigned> TinyTripCountVectorThreshold(
196     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
197     cl::desc("Loops with a constant trip count that is smaller than this "
198              "value are vectorized only if no scalar iteration overheads "
199              "are incurred."));
200 
201 static cl::opt<unsigned> VectorizeMemoryCheckThreshold(
202     "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
203     cl::desc("The maximum allowed number of runtime memory checks"));
204 
205 // Option prefer-predicate-over-epilogue indicates that an epilogue is
206 // undesired and that predication is preferred; the enum below lists the
207 // available choices. I.e., the vectorizer will try to fold the tail loop
208 // (epilogue) into the vector body and predicate the instructions accordingly.
209 // If tail-folding fails, the fallback strategy depends on these values:
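// For example (an illustrative invocation, not exercised here):
//   opt -passes=loop-vectorize \
//       -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue ...
// asks the vectorizer to try tail folding first and to fall back to a scalar
// epilogue only if tail folding is not possible.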
210 namespace PreferPredicateTy {
211   enum Option {
212     ScalarEpilogue = 0,
213     PredicateElseScalarEpilogue,
214     PredicateOrDontVectorize
215   };
216 } // namespace PreferPredicateTy
217 
218 static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
219     "prefer-predicate-over-epilogue",
220     cl::init(PreferPredicateTy::ScalarEpilogue),
221     cl::Hidden,
222     cl::desc("Tail-folding and predication preferences over creating a scalar "
223              "epilogue loop."),
224     cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
225                          "scalar-epilogue",
226                          "Don't tail-predicate loops, create scalar epilogue"),
227               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
228                          "predicate-else-scalar-epilogue",
229                          "prefer tail-folding, create scalar epilogue if tail "
230                          "folding fails."),
231               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
232                          "predicate-dont-vectorize",
233                          "prefer tail-folding, don't attempt vectorization if "
234                          "tail-folding fails.")));
235 
236 static cl::opt<TailFoldingStyle> ForceTailFoldingStyle(
237     "force-tail-folding-style", cl::desc("Force the tail folding style"),
238     cl::init(TailFoldingStyle::None),
239     cl::values(
240         clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"),
241         clEnumValN(
242             TailFoldingStyle::Data, "data",
243             "Create lane mask for data only, using active.lane.mask intrinsic"),
244         clEnumValN(TailFoldingStyle::DataWithoutLaneMask,
245                    "data-without-lane-mask",
246                    "Create lane mask with compare/stepvector"),
247         clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control",
248                    "Create lane mask using active.lane.mask intrinsic, and use "
249                    "it for both data and control flow"),
250         clEnumValN(
251             TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
252             "data-and-control-without-rt-check",
253             "Similar to data-and-control, but remove the runtime check")));
254 
255 static cl::opt<bool> MaximizeBandwidth(
256     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
257     cl::desc("Maximize bandwidth when selecting vectorization factor which "
258              "will be determined by the smallest type in the loop."));
259 
260 static cl::opt<bool> EnableInterleavedMemAccesses(
261     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
262     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
263 
264 /// An interleave-group may need masking if it resides in a block that needs
265 /// predication, or in order to mask away gaps.
266 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
267     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
268     cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
269 
270 static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
271     "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
272     cl::desc("We don't interleave loops with an estimated constant trip count "
273              "below this number"));
274 
275 static cl::opt<unsigned> ForceTargetNumScalarRegs(
276     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
277     cl::desc("A flag that overrides the target's number of scalar registers."));
278 
279 static cl::opt<unsigned> ForceTargetNumVectorRegs(
280     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
281     cl::desc("A flag that overrides the target's number of vector registers."));
282 
283 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
284     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
285     cl::desc("A flag that overrides the target's max interleave factor for "
286              "scalar loops."));
287 
288 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
289     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
290     cl::desc("A flag that overrides the target's max interleave factor for "
291              "vectorized loops."));
292 
293 static cl::opt<unsigned> ForceTargetInstructionCost(
294     "force-target-instruction-cost", cl::init(0), cl::Hidden,
295     cl::desc("A flag that overrides the target's expected cost for "
296              "an instruction to a single constant value. Mostly "
297              "useful for getting consistent testing."));
298 
299 static cl::opt<bool> ForceTargetSupportsScalableVectors(
300     "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
301     cl::desc(
302         "Pretend that scalable vectors are supported, even if the target does "
303         "not support them. This flag should only be used for testing."));
304 
305 static cl::opt<unsigned> SmallLoopCost(
306     "small-loop-cost", cl::init(20), cl::Hidden,
307     cl::desc(
308         "The cost of a loop that is considered 'small' by the interleaver."));
309 
310 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
311     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
312     cl::desc("Enable the use of the block frequency analysis to access PGO "
313              "heuristics minimizing code growth in cold regions and being more "
314              "aggressive in hot regions."));
315 
316 // Runtime interleave loops for load/store throughput.
317 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
318     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
319     cl::desc(
320         "Enable runtime interleaving until load/store ports are saturated"));
321 
322 /// Interleave small loops with scalar reductions.
323 static cl::opt<bool> InterleaveSmallLoopScalarReduction(
324     "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
325     cl::desc("Enable interleaving for loops with small iteration counts that "
326              "contain scalar reductions to expose ILP."));
327 
328 /// The number of stores in a loop that are allowed to need predication.
329 static cl::opt<unsigned> NumberOfStoresToPredicate(
330     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
331     cl::desc("Max number of stores to be predicated behind an if."));
332 
333 static cl::opt<bool> EnableIndVarRegisterHeur(
334     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
335     cl::desc("Count the induction variable only once when interleaving"));
336 
337 static cl::opt<bool> EnableCondStoresVectorization(
338     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
339     cl::desc("Enable if-predication of stores during vectorization."));
340 
341 static cl::opt<unsigned> MaxNestedScalarReductionIC(
342     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
343     cl::desc("The maximum interleave count to use when interleaving a scalar "
344              "reduction in a nested loop."));
345 
346 static cl::opt<bool>
347     PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
348                            cl::Hidden,
349                            cl::desc("Prefer in-loop vector reductions, "
350                                     "overriding the target's preference."));
351 
352 static cl::opt<bool> ForceOrderedReductions(
353     "force-ordered-reductions", cl::init(false), cl::Hidden,
354     cl::desc("Enable the vectorization of loops with in-order (strict) "
355              "FP reductions"));
356 
357 static cl::opt<bool> PreferPredicatedReductionSelect(
358     "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
359     cl::desc(
360         "Prefer predicating a reduction operation over an after-loop select."));
361 
362 namespace llvm {
363 cl::opt<bool> EnableVPlanNativePath(
364     "enable-vplan-native-path", cl::Hidden,
365     cl::desc("Enable VPlan-native vectorization path with "
366              "support for outer loop vectorization."));
367 }
368 
369 // This flag enables the stress testing of the VPlan H-CFG construction in the
370 // VPlan-native vectorization path. It must be used in conjunction with
371 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
372 // verification of the H-CFGs built.
373 static cl::opt<bool> VPlanBuildStressTest(
374     "vplan-build-stress-test", cl::init(false), cl::Hidden,
375     cl::desc(
376         "Build VPlan for every supported loop nest in the function and bail "
377         "out right after the build (stress test the VPlan H-CFG construction "
378         "in the VPlan-native vectorization path)."));
379 
380 cl::opt<bool> llvm::EnableLoopInterleaving(
381     "interleave-loops", cl::init(true), cl::Hidden,
382     cl::desc("Enable loop interleaving in Loop vectorization passes"));
383 cl::opt<bool> llvm::EnableLoopVectorization(
384     "vectorize-loops", cl::init(true), cl::Hidden,
385     cl::desc("Run the Loop vectorization passes"));
386 
387 static cl::opt<bool> PrintVPlansInDotFormat(
388     "vplan-print-in-dot-format", cl::Hidden,
389     cl::desc("Use dot format instead of plain text when dumping VPlans"));
390 
391 static cl::opt<cl::boolOrDefault> ForceSafeDivisor(
392     "force-widen-divrem-via-safe-divisor", cl::Hidden,
393     cl::desc(
394         "Override cost based safe divisor widening for div/rem instructions"));
395 
396 static cl::opt<bool> UseWiderVFIfCallVariantsPresent(
397     "vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true),
398     cl::Hidden,
399     cl::desc("Try wider VFs if they enable the use of vector variants"));
400 
401 // Likelihood of bypassing the vectorized loop because assumptions about SCEV
402 // variables not overflowing do not hold. See `emitSCEVChecks`.
403 static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127};
404 // Likelihood of bypassing the vectorized loop because pointers overlap. See
405 // `emitMemRuntimeChecks`.
406 static constexpr uint32_t MemCheckBypassWeights[] = {1, 127};
407 // Likelihood of bypassing the vectorized loop because there are zero trips left
408 // after prolog. See `emitIterationCountCheck`.
409 static constexpr uint32_t MinItersBypassWeights[] = {1, 127};
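// Expressed as branch weights, each {1, 127} pair above encodes a probability
// of roughly 1/128 for the bypassing edge.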
410 
411 /// A helper function that returns true if the given type is irregular. The
412 /// type is irregular if its allocated size doesn't equal the store size of an
413 /// element of the corresponding vector type.
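/// For example (illustrative): on typical x86 data layouts, x86_fp80 has a
/// type size of 80 bits but an allocation size of 96 or 128 bits, so an array
/// of x86_fp80 is not bitcast-compatible with a vector of x86_fp80.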
414 static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
415   // Determine if an array of N elements of type Ty is "bitcast compatible"
416   // with a <N x Ty> vector.
417   // This is only true if there is no padding between the array elements.
418   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
419 }
420 
421 /// A helper function that returns the reciprocal of the block probability of
422 /// predicated blocks. If we return X, we are assuming the predicated block
423 /// will execute once for every X iterations of the loop header.
424 ///
425 /// TODO: We should use actual block probability here, if available. Currently,
426 ///       we always assume predicated blocks have a 50% chance of executing.
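/// For example, with the current return value of 2, callers typically divide
/// the cost of a predicated block by 2, modelling a block that executes on
/// about half of the header's iterations (a simplifying assumption).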
427 static unsigned getReciprocalPredBlockProb() { return 2; }
428 
429 /// Returns "best known" trip count for the specified loop \p L as defined by
430 /// the following procedure:
431 ///   1) Returns exact trip count if it is known.
432 ///   2) Returns expected trip count according to profile data if any.
433 ///   3) Returns upper bound estimate if it is known.
434 ///   4) Returns std::nullopt if all of the above failed.
435 static std::optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE,
436                                                    Loop *L) {
437   // Check if exact trip count is known.
438   if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
439     return ExpectedTC;
440 
441   // Check if there is an expected trip count available from profile data.
442   if (LoopVectorizeWithBlockFrequency)
443     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
444       return *EstimatedTC;
445 
446   // Check if upper bound estimate is known.
447   if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
448     return ExpectedTC;
449 
450   return std::nullopt;
451 }
452 
453 /// Return a vector containing interleaved elements from multiple
454 /// smaller input vectors.
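/// For example, interleaving the two inputs
///   Vals[0] = <A0, A1, A2, A3>, Vals[1] = <B0, B1, B2, B3>
/// yields
///   <A0, B0, A1, B1, A2, B2, A3, B3>.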
455 static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
456                                 const Twine &Name) {
457   unsigned Factor = Vals.size();
458   assert(Factor > 1 && "Tried to interleave invalid number of vectors");
459 
460   VectorType *VecTy = cast<VectorType>(Vals[0]->getType());
461 #ifndef NDEBUG
462   for (Value *Val : Vals)
463     assert(Val->getType() == VecTy && "Tried to interleave mismatched types");
464 #endif
465 
466   // Scalable vectors cannot use arbitrary shufflevectors (only splats), so
467   // must use intrinsics to interleave.
468   if (VecTy->isScalableTy()) {
469     VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy);
470     return Builder.CreateIntrinsic(
471         WideVecTy, Intrinsic::experimental_vector_interleave2, Vals,
472         /*FMFSource=*/nullptr, Name);
473   }
474 
475   // Fixed length. Start by concatenating all vectors into a wide vector.
476   Value *WideVec = concatenateVectors(Builder, Vals);
477 
478   // Interleave the elements into the wide vector.
479   const unsigned NumElts = VecTy->getElementCount().getFixedValue();
480   return Builder.CreateShuffleVector(
481       WideVec, createInterleaveMask(NumElts, Factor), Name);
482 }
483 
484 namespace {
485 // Forward declare GeneratedRTChecks.
486 class GeneratedRTChecks;
487 
488 using SCEV2ValueTy = DenseMap<const SCEV *, Value *>;
489 } // namespace
490 
491 namespace llvm {
492 
493 AnalysisKey ShouldRunExtraVectorPasses::Key;
494 
495 /// InnerLoopVectorizer vectorizes loops which contain only one basic
496 /// block to a specified vectorization factor (VF).
497 /// This class performs the widening of scalars into vectors, or multiple
498 /// scalars. This class also implements the following features:
499 /// * It inserts an epilogue loop for handling loops that don't have iteration
500 ///   counts that are known to be a multiple of the vectorization factor.
501 /// * It handles the code generation for reduction variables.
502 /// * Scalarization (implementation using scalars) of un-vectorizable
503 ///   instructions.
504 /// InnerLoopVectorizer does not perform any vectorization-legality
505 /// checks, and relies on the caller to check for the different legality
506 /// aspects. The InnerLoopVectorizer relies on the
507 /// LoopVectorizationLegality class to provide information about the induction
508 /// and reduction variables that were found.
509 class InnerLoopVectorizer {
510 public:
511   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
512                       LoopInfo *LI, DominatorTree *DT,
513                       const TargetLibraryInfo *TLI,
514                       const TargetTransformInfo *TTI, AssumptionCache *AC,
515                       OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
516                       ElementCount MinProfitableTripCount,
517                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
518                       LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
519                       ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
520       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
521         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
522         Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
523         PSI(PSI), RTChecks(RTChecks) {
524     // Query this against the original loop and save it here because the profile
525     // of the original loop header may change as the transformation happens.
526     OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
527         OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
528 
529     if (MinProfitableTripCount.isZero())
530       this->MinProfitableTripCount = VecWidth;
531     else
532       this->MinProfitableTripCount = MinProfitableTripCount;
533   }
534 
535   virtual ~InnerLoopVectorizer() = default;
536 
537   /// Create a new empty loop that will contain vectorized instructions later
538   /// on, while the old loop will be used as the scalar remainder. Control flow
539   /// is generated around the vectorized (and scalar epilogue) loops consisting
540   /// of various checks and bypasses. Return the pre-header block of the new
541   /// loop and the start value for the canonical induction, if it is != 0. The
542   /// latter is the case when vectorizing the epilogue loop. In the case of
543   /// epilogue vectorization, this function is overridden to handle the more
544   /// complex control flow around the loops.  \p ExpandedSCEVs is used to
545   /// look up SCEV expansions for expressions needed during skeleton creation.
546   virtual std::pair<BasicBlock *, Value *>
547   createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs);
548 
549   /// Fix the vectorized code, taking care of header phis, live-outs, and more.
550   void fixVectorizedLoop(VPTransformState &State, VPlan &Plan);
551 
552   // Return true if any runtime check is added.
553   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
554 
555   /// A type for vectorized values in the new loop. Each value from the
556   /// original loop, when vectorized, is represented by UF vector values in the
557   /// new unrolled loop, where UF is the unroll factor.
558   using VectorParts = SmallVector<Value *, 2>;
559 
560   /// A helper function to scalarize a single Instruction in the innermost loop.
561   /// Generates a scalar instance of \p Instr for the lane and unroll part
562   /// identified by \p Instance, using the VPValue operands from \p RepRecipe
563   /// instead of \p Instr's operands.
565   void scalarizeInstruction(const Instruction *Instr,
566                             VPReplicateRecipe *RepRecipe,
567                             const VPIteration &Instance,
568                             VPTransformState &State);
569 
570   /// Try to vectorize interleaved access group \p Group with the base address
571   /// given in \p Addr, optionally masking the vector operations if \p
572   /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
573   /// values in the vectorized loop.
574   void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
575                                 ArrayRef<VPValue *> VPDefs,
576                                 VPTransformState &State, VPValue *Addr,
577                                 ArrayRef<VPValue *> StoredValues,
578                                 VPValue *BlockInMask, bool NeedsMaskForGaps);
579 
580   /// Fix the non-induction PHIs in \p Plan.
581   void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State);
582 
583   /// Returns true if the reordering of FP operations is not allowed, but we are
584   /// able to vectorize with strict in-order reductions for the given RdxDesc.
585   bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc);
586 
587   /// Create a new phi node for the induction variable \p OrigPhi to resume
588   /// iteration count in the scalar epilogue, from where the vectorized loop
589   /// left off. \p Step is the SCEV-expanded induction step to use. In cases
590   /// where the loop skeleton is more complicated (i.e., epilogue vectorization)
591   /// and the resume values can come from an additional bypass block, the \p
592   /// AdditionalBypass pair provides information about the bypass block and the
593   /// end value on the edge from bypass to this loop.
594   PHINode *createInductionResumeValue(
595       PHINode *OrigPhi, const InductionDescriptor &ID, Value *Step,
596       ArrayRef<BasicBlock *> BypassBlocks,
597       std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
598 
599   /// Returns the original loop trip count.
600   Value *getTripCount() const { return TripCount; }
601 
602   /// Used to set the trip count after ILV's construction and after the
603   /// preheader block has been executed. Note that this always holds the trip
604   /// count of the original loop for both main loop and epilogue vectorization.
605   void setTripCount(Value *TC) { TripCount = TC; }
606 
607 protected:
608   friend class LoopVectorizationPlanner;
609 
610   /// A small list of PHINodes.
611   using PhiVector = SmallVector<PHINode *, 4>;
612 
613   /// A type for scalarized values in the new loop. Each value from the
614   /// original loop, when scalarized, is represented by UF x VF scalar values
615   /// in the new unrolled loop, where UF is the unroll factor and VF is the
616   /// vectorization factor.
617   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
618 
619   /// Set up the values of the IVs correctly when exiting the vector loop.
620   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
621                     Value *VectorTripCount, Value *EndValue,
622                     BasicBlock *MiddleBlock, BasicBlock *VectorHeader,
623                     VPlan &Plan, VPTransformState &State);
624 
625   /// Create the exit value of first order recurrences in the middle block and
626   /// update their users.
627   void fixFixedOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR,
628                                VPTransformState &State);
629 
630   /// Create code for the loop exit value of the reduction.
631   void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);
632 
633   /// Iteratively sink the scalarized operands of a predicated instruction into
634   /// the block that was created for it.
635   void sinkScalarOperands(Instruction *PredInst);
636 
637   /// Returns (and creates if needed) the trip count of the widened loop.
638   Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);
639 
640   /// Returns a bitcasted value to the requested vector type.
641   /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
642   Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
643                                 const DataLayout &DL);
644 
645   /// Emit a bypass check to see if the vector trip count is zero, including if
646   /// it overflows.
647   void emitIterationCountCheck(BasicBlock *Bypass);
648 
649   /// Emit a bypass check to see if all of the SCEV assumptions we've
650   /// had to make are correct. Returns the block containing the checks or
651   /// nullptr if no checks have been added.
652   BasicBlock *emitSCEVChecks(BasicBlock *Bypass);
653 
654   /// Emit bypass checks to check any memory assumptions we may have made.
655   /// Returns the block containing the checks or nullptr if no checks have been
656   /// added.
657   BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass);
658 
659   /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
660   /// vector loop preheader, middle block and scalar preheader.
661   void createVectorLoopSkeleton(StringRef Prefix);
662 
663   /// Create new phi nodes for the induction variables to resume iteration count
664   /// in the scalar epilogue, from where the vectorized loop left off.
665   /// In cases where the loop skeleton is more complicated (e.g., epilogue
666   /// vectorization) and the resume values can come from an additional bypass
667   /// block, the \p AdditionalBypass pair provides information about the bypass
668   /// block and the end value on the edge from bypass to this loop.
669   void createInductionResumeValues(
670       const SCEV2ValueTy &ExpandedSCEVs,
671       std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
672 
673   /// Complete the loop skeleton by adding debug MDs, creating appropriate
674   /// conditional branches in the middle block, preparing the builder and
675   /// running the verifier. Return the preheader of the completed vector loop.
676   BasicBlock *completeLoopSkeleton();
677 
678   /// Collect poison-generating recipes that may generate a poison value that is
679   /// used after vectorization, even when their operands are not poison. Those
680   /// recipes meet the following conditions:
681   ///  * Contribute to the address computation of a recipe generating a widen
682   ///    memory load/store (VPWidenMemoryInstructionRecipe or
683   ///    VPInterleaveRecipe).
684   ///  * Such a widen memory load/store has at least one underlying Instruction
685   ///    that is in a basic block that needs predication and after vectorization
686   ///    the generated instruction won't be predicated.
687   void collectPoisonGeneratingRecipes(VPTransformState &State);
688 
689   /// Allow subclasses to override and print debug traces before/after vplan
690   /// execution, when trace information is requested.
691   virtual void printDebugTracesAtStart() {}
692   virtual void printDebugTracesAtEnd() {}
693 
694   /// The original loop.
695   Loop *OrigLoop;
696 
697   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
698   /// dynamic knowledge to simplify SCEV expressions and converts them to a
699   /// more usable form.
700   PredicatedScalarEvolution &PSE;
701 
702   /// Loop Info.
703   LoopInfo *LI;
704 
705   /// Dominator Tree.
706   DominatorTree *DT;
707 
708   /// Target Library Info.
709   const TargetLibraryInfo *TLI;
710 
711   /// Target Transform Info.
712   const TargetTransformInfo *TTI;
713 
714   /// Assumption Cache.
715   AssumptionCache *AC;
716 
717   /// Interface to emit optimization remarks.
718   OptimizationRemarkEmitter *ORE;
719 
720   /// The vectorization SIMD factor to use. Each vector will have this many
721   /// vector elements.
722   ElementCount VF;
723 
724   ElementCount MinProfitableTripCount;
725 
726   /// The vectorization unroll factor to use. Each scalar is vectorized to this
727   /// many different vector instructions.
728   unsigned UF;
729 
730   /// The builder that we use
731   IRBuilder<> Builder;
732 
733   // --- Vectorization state ---
734 
735   /// The vector-loop preheader.
736   BasicBlock *LoopVectorPreHeader;
737 
738   /// The scalar-loop preheader.
739   BasicBlock *LoopScalarPreHeader;
740 
741   /// Middle Block between the vector and the scalar.
742   BasicBlock *LoopMiddleBlock;
743 
744   /// The unique ExitBlock of the scalar loop if one exists.  Note that
745   /// there can be multiple exiting edges reaching this block.
746   BasicBlock *LoopExitBlock;
747 
748   /// The scalar loop body.
749   BasicBlock *LoopScalarBody;
750 
751   /// A list of all bypass blocks. The first block is the entry of the loop.
752   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
753 
754   /// Store instructions that were predicated.
755   SmallVector<Instruction *, 4> PredicatedInstructions;
756 
757   /// Trip count of the original loop.
758   Value *TripCount = nullptr;
759 
760   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
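  /// For example, with TripCount = 100, VF = 4 and UF = 2 this is 96.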
761   Value *VectorTripCount = nullptr;
762 
763   /// The legality analysis.
764   LoopVectorizationLegality *Legal;
765 
766   /// The profitability analysis.
767   LoopVectorizationCostModel *Cost;
768 
769   // Record whether runtime checks are added.
770   bool AddedSafetyChecks = false;
771 
772   // Holds the end values for each induction variable. We save the end values
773   // so we can later fix-up the external users of the induction variables.
774   DenseMap<PHINode *, Value *> IVEndValues;
775 
776   /// BFI and PSI are used to check for profile guided size optimizations.
777   BlockFrequencyInfo *BFI;
778   ProfileSummaryInfo *PSI;
779 
780   // Whether this loop should be optimized for size based on profile guided size
781   // optimizations.
782   bool OptForSizeBasedOnProfile;
783 
784   /// Structure to hold information about generated runtime checks, responsible
785   /// for cleaning the checks, if vectorization turns out unprofitable.
786   GeneratedRTChecks &RTChecks;
787 
788   // Holds the resume values for reductions in the loops, used to set the
789   // correct start value of reduction PHIs when vectorizing the epilogue.
790   SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4>
791       ReductionResumeValues;
792 };
793 
794 class InnerLoopUnroller : public InnerLoopVectorizer {
795 public:
796   InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
797                     LoopInfo *LI, DominatorTree *DT,
798                     const TargetLibraryInfo *TLI,
799                     const TargetTransformInfo *TTI, AssumptionCache *AC,
800                     OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
801                     LoopVectorizationLegality *LVL,
802                     LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
803                     ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
804       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
805                             ElementCount::getFixed(1),
806                             ElementCount::getFixed(1), UnrollFactor, LVL, CM,
807                             BFI, PSI, Check) {}
808 };
809 
810 /// Encapsulate information regarding vectorization of a loop and its epilogue.
811 /// This information is meant to be updated and used across two stages of
812 /// epilogue vectorization.
813 struct EpilogueLoopVectorizationInfo {
814   ElementCount MainLoopVF = ElementCount::getFixed(0);
815   unsigned MainLoopUF = 0;
816   ElementCount EpilogueVF = ElementCount::getFixed(0);
817   unsigned EpilogueUF = 0;
818   BasicBlock *MainLoopIterationCountCheck = nullptr;
819   BasicBlock *EpilogueIterationCountCheck = nullptr;
820   BasicBlock *SCEVSafetyCheck = nullptr;
821   BasicBlock *MemSafetyCheck = nullptr;
822   Value *TripCount = nullptr;
823   Value *VectorTripCount = nullptr;
824 
825   EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
826                                 ElementCount EVF, unsigned EUF)
827       : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
828     assert(EUF == 1 &&
829            "A high UF for the epilogue loop is likely not beneficial.");
830   }
831 };
832 
833 /// An extension of the inner loop vectorizer that creates a skeleton for a
834 /// vectorized loop that has its epilogue (residual) also vectorized.
835 /// The idea is to run the vplan on a given loop twice, first to set up the
836 /// skeleton and vectorize the main loop, and second to complete the skeleton
837 /// from the first step and vectorize the epilogue.  This is achieved by
838 /// deriving two concrete strategy classes from this base class and invoking
839 /// them in succession from the loop vectorizer planner.
840 class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
841 public:
842   InnerLoopAndEpilogueVectorizer(
843       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
844       DominatorTree *DT, const TargetLibraryInfo *TLI,
845       const TargetTransformInfo *TTI, AssumptionCache *AC,
846       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
847       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
848       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
849       GeneratedRTChecks &Checks)
850       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
851                             EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
852                             CM, BFI, PSI, Checks),
853         EPI(EPI) {}
854 
855   // Override this function to handle the more complex control flow around the
856   // three loops.
857   std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton(
858       const SCEV2ValueTy &ExpandedSCEVs) final {
859     return createEpilogueVectorizedLoopSkeleton(ExpandedSCEVs);
860   }
861 
862   /// The interface for creating a vectorized skeleton using one of two
863   /// different strategies, each corresponding to one execution of the vplan
864   /// as described above.
865   virtual std::pair<BasicBlock *, Value *>
866   createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) = 0;
867 
868   /// Holds and updates state information required to vectorize the main loop
869   /// and its epilogue in two separate passes. This setup helps us avoid
870   /// regenerating and recomputing runtime safety checks. It also helps us to
871   /// shorten the iteration-count-check path length for the cases where the
872   /// iteration count of the loop is so small that the main vector loop is
873   /// completely skipped.
874   EpilogueLoopVectorizationInfo &EPI;
875 };
876 
877 /// A specialized derived class of inner loop vectorizer that performs
878 /// vectorization of *main* loops in the process of vectorizing loops and their
879 /// epilogues.
880 class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
881 public:
882   EpilogueVectorizerMainLoop(
883       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
884       DominatorTree *DT, const TargetLibraryInfo *TLI,
885       const TargetTransformInfo *TTI, AssumptionCache *AC,
886       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
887       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
888       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
889       GeneratedRTChecks &Check)
890       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
891                                        EPI, LVL, CM, BFI, PSI, Check) {}
892   /// Implements the interface for creating a vectorized skeleton using the
893   /// *main loop* strategy (i.e., the first pass of vplan execution).
894   std::pair<BasicBlock *, Value *>
895   createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
896 
897 protected:
898   /// Emits an iteration count bypass check once for the main loop (when \p
899   /// ForEpilogue is false) and once for the epilogue loop (when \p
900   /// ForEpilogue is true).
901   BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
902   void printDebugTracesAtStart() override;
903   void printDebugTracesAtEnd() override;
904 };
905 
906 // A specialized derived class of inner loop vectorizer that performs
907 // vectorization of *epilogue* loops in the process of vectorizing loops and
908 // their epilogues.
909 class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
910 public:
911   EpilogueVectorizerEpilogueLoop(
912       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
913       DominatorTree *DT, const TargetLibraryInfo *TLI,
914       const TargetTransformInfo *TTI, AssumptionCache *AC,
915       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
916       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
917       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
918       GeneratedRTChecks &Checks)
919       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
920                                        EPI, LVL, CM, BFI, PSI, Checks) {
921     TripCount = EPI.TripCount;
922   }
923   /// Implements the interface for creating a vectorized skeleton using the
924   /// *epilogue loop* strategy (i.e., the second pass of vplan execution).
925   std::pair<BasicBlock *, Value *>
926   createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
927 
928 protected:
929   /// Emits an iteration count bypass check after the main vector loop has
930   /// finished to see if there are any iterations left to execute by either
931   /// the vector epilogue or the scalar epilogue.
932   BasicBlock *emitMinimumVectorEpilogueIterCountCheck(
933                                                       BasicBlock *Bypass,
934                                                       BasicBlock *Insert);
935   void printDebugTracesAtStart() override;
936   void printDebugTracesAtEnd() override;
937 };
938 } // end namespace llvm
939 
940 /// Look for a meaningful debug location on the instruction or its
941 /// operands.
942 static DebugLoc getDebugLocFromInstOrOperands(Instruction *I) {
943   if (!I)
944     return DebugLoc();
945 
946   DebugLoc Empty;
947   if (I->getDebugLoc() != Empty)
948     return I->getDebugLoc();
949 
950   for (Use &Op : I->operands()) {
951     if (Instruction *OpInst = dyn_cast<Instruction>(Op))
952       if (OpInst->getDebugLoc() != Empty)
953         return OpInst->getDebugLoc();
954   }
955 
956   return I->getDebugLoc();
957 }
958 
959 /// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
960 /// is passed, the message relates to that particular instruction.
961 #ifndef NDEBUG
962 static void debugVectorizationMessage(const StringRef Prefix,
963                                       const StringRef DebugMsg,
964                                       Instruction *I) {
965   dbgs() << "LV: " << Prefix << DebugMsg;
966   if (I != nullptr)
967     dbgs() << " " << *I;
968   else
969     dbgs() << '.';
970   dbgs() << '\n';
971 }
972 #endif
973 
974 /// Create an analysis remark that explains why vectorization failed
975 ///
976 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
977 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
978 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
979 /// the location of the remark.  \return the remark object that can be
980 /// streamed to.
981 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
982     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
983   Value *CodeRegion = TheLoop->getHeader();
984   DebugLoc DL = TheLoop->getStartLoc();
985 
986   if (I) {
987     CodeRegion = I->getParent();
988     // If there is no debug location attached to the instruction, revert back to
989     // using the loop's.
990     if (I->getDebugLoc())
991       DL = I->getDebugLoc();
992   }
993 
994   return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
995 }
996 
997 namespace llvm {
998 
999 /// Return a value for Step multiplied by VF.
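/// For example (a sketch, not a guarantee about the exact IR emitted): for a
/// fixed VF of 4 and Step of 2 this is the constant 8; for a scalable VF of
/// <vscale x 4> it is an expression equivalent to 8 * vscale.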
1000 Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
1001                        int64_t Step) {
1002   assert(Ty->isIntegerTy() && "Expected an integer step");
1003   return B.CreateElementCount(Ty, VF.multiplyCoefficientBy(Step));
1004 }
1005 
1006 /// Return the runtime value for VF.
1007 Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
1008   return B.CreateElementCount(Ty, VF);
1009 }
1010 
1011 const SCEV *createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE,
1012                                 Loop *OrigLoop) {
1013   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
1014   assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && "Invalid loop count");
1015 
1016   ScalarEvolution &SE = *PSE.getSE();
1017   return SE.getTripCountFromExitCount(BackedgeTakenCount, IdxTy, OrigLoop);
1018 }
1019 
1020 void reportVectorizationFailure(const StringRef DebugMsg,
1021                                 const StringRef OREMsg, const StringRef ORETag,
1022                                 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1023                                 Instruction *I) {
1024   LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
1025   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1026   ORE->emit(
1027       createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1028       << "loop not vectorized: " << OREMsg);
1029 }
1030 
1031 void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
1032                              OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1033                              Instruction *I) {
1034   LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
1035   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1036   ORE->emit(
1037       createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1038       << Msg);
1039 }
1040 
1041 /// Report successful vectorization of the loop. In case an outer loop is
1042 /// vectorized, prepend "outer" to the vectorization remark.
1043 static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1044                                 VectorizationFactor VF, unsigned IC) {
1045   LLVM_DEBUG(debugVectorizationMessage(
1046       "Vectorizing: ", TheLoop->isInnermost() ? "innermost loop" : "outer loop",
1047       nullptr));
1048   StringRef LoopType = TheLoop->isInnermost() ? "" : "outer ";
1049   ORE->emit([&]() {
1050     return OptimizationRemark(LV_NAME, "Vectorized", TheLoop->getStartLoc(),
1051                               TheLoop->getHeader())
1052            << "vectorized " << LoopType << "loop (vectorization width: "
1053            << ore::NV("VectorizationFactor", VF.Width)
1054            << ", interleaved count: " << ore::NV("InterleaveCount", IC) << ")";
1055   });
1056 }
1057 
1058 } // end namespace llvm
1059 
1060 #ifndef NDEBUG
1061 /// \return string containing a file name and a line # for the given loop.
1062 static std::string getDebugLocString(const Loop *L) {
1063   std::string Result;
1064   if (L) {
1065     raw_string_ostream OS(Result);
1066     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
1067       LoopDbgLoc.print(OS);
1068     else
1069       // Just print the module name.
1070       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
1071     OS.flush();
1072   }
1073   return Result;
1074 }
1075 #endif
1076 
1077 void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
1078     VPTransformState &State) {
1079 
1080   // Collect recipes in the backward slice of `Root` that may generate a poison
1081   // value that is used after vectorization.
1082   SmallPtrSet<VPRecipeBase *, 16> Visited;
1083   auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
1084     SmallVector<VPRecipeBase *, 16> Worklist;
1085     Worklist.push_back(Root);
1086 
1087     // Traverse the backward slice of Root through its use-def chain.
1088     while (!Worklist.empty()) {
1089       VPRecipeBase *CurRec = Worklist.back();
1090       Worklist.pop_back();
1091 
1092       if (!Visited.insert(CurRec).second)
1093         continue;
1094 
1095       // Prune search if we find another recipe generating a widen memory
1096       // instruction. Widen memory instructions involved in address computation
1097       // will lead to gather/scatter instructions, which don't need to be
1098       // handled.
1099       if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
1100           isa<VPInterleaveRecipe>(CurRec) ||
1101           isa<VPScalarIVStepsRecipe>(CurRec) ||
1102           isa<VPCanonicalIVPHIRecipe>(CurRec) ||
1103           isa<VPActiveLaneMaskPHIRecipe>(CurRec))
1104         continue;
1105 
1106       // This recipe contributes to the address computation of a widen
1107       // load/store. If the underlying instruction has poison-generating flags,
1108       // drop them directly.
1109       if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {
1110         RecWithFlags->dropPoisonGeneratingFlags();
1111       } else {
1112         Instruction *Instr = dyn_cast_or_null<Instruction>(
1113             CurRec->getVPSingleValue()->getUnderlyingValue());
1114         (void)Instr;
1115         assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
1116                "found instruction with poison generating flags not covered by "
1117                "VPRecipeWithIRFlags");
1118       }
1119 
1120       // Add new definitions to the worklist.
1121       for (VPValue *operand : CurRec->operands())
1122         if (VPRecipeBase *OpDef = operand->getDefiningRecipe())
1123           Worklist.push_back(OpDef);
1124     }
1125   });
1126 
1127   // Traverse all the recipes in the VPlan and collect the poison-generating
1128   // recipes in the backward slice starting at the address of a
1129   // VPWidenMemoryInstructionRecipe or a VPInterleaveRecipe.
1130   auto Iter = vp_depth_first_deep(State.Plan->getEntry());
1131   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
1132     for (VPRecipeBase &Recipe : *VPBB) {
1133       if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
1134         Instruction &UnderlyingInstr = WidenRec->getIngredient();
1135         VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
1136         if (AddrDef && WidenRec->isConsecutive() &&
1137             Legal->blockNeedsPredication(UnderlyingInstr.getParent()))
1138           collectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
1139       } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
1140         VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
1141         if (AddrDef) {
1142           // Check if any member of the interleave group needs predication.
1143           const InterleaveGroup<Instruction> *InterGroup =
1144               InterleaveRec->getInterleaveGroup();
1145           bool NeedPredication = false;
1146           for (int I = 0, NumMembers = InterGroup->getNumMembers();
1147                I < NumMembers; ++I) {
1148             Instruction *Member = InterGroup->getMember(I);
1149             if (Member)
1150               NeedPredication |=
1151                   Legal->blockNeedsPredication(Member->getParent());
1152           }
1153 
1154           if (NeedPredication)
1155             collectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
1156         }
1157       }
1158     }
1159   }
1160 }
1161 
1162 namespace llvm {
1163 
1164 // Loop vectorization cost-model hints how the scalar epilogue loop should be
1165 // lowered.
1166 enum ScalarEpilogueLowering {
1167 
1168   // The default: allowing scalar epilogues.
1169   CM_ScalarEpilogueAllowed,
1170 
1171   // Vectorization with OptForSize: don't allow epilogues.
1172   CM_ScalarEpilogueNotAllowedOptSize,
1173 
1174   // A special case of vectorization with OptForSize: loops with a very small
1175   // trip count are considered for vectorization under OptForSize, thereby
1176   // making sure the cost of their loop body is dominant, free of runtime
1177   // guards and scalar iteration overheads.
1178   CM_ScalarEpilogueNotAllowedLowTripLoop,
1179 
1180   // Loop hint predicate indicating an epilogue is undesired.
1181   CM_ScalarEpilogueNotNeededUsePredicate,
1182 
1183   // Directive indicating we must either tail fold or not vectorize
1184   CM_ScalarEpilogueNotAllowedUsePredicate
1185 };
1186 
1187 using InstructionVFPair = std::pair<Instruction *, ElementCount>;
1188 
1189 /// LoopVectorizationCostModel - estimates the expected speedups due to
1190 /// vectorization.
1191 /// In many cases vectorization is not profitable, and this can happen for a
1192 /// number of reasons. In this class we mainly attempt to predict the expected
1193 /// speedups/slowdowns due to the supported instruction set. We use the
1194 /// TargetTransformInfo to query the different backends for the cost of
1195 /// different operations.
1196 class LoopVectorizationCostModel {
1197 public:
1198   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
1199                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
1200                              LoopVectorizationLegality *Legal,
1201                              const TargetTransformInfo &TTI,
1202                              const TargetLibraryInfo *TLI, DemandedBits *DB,
1203                              AssumptionCache *AC,
1204                              OptimizationRemarkEmitter *ORE, const Function *F,
1205                              const LoopVectorizeHints *Hints,
1206                              InterleavedAccessInfo &IAI)
1207       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1208         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1209         Hints(Hints), InterleaveInfo(IAI) {}
1210 
1211   /// \return An upper bound for the vectorization factors (both fixed and
1212   /// scalable). If the factors are 0, vectorization and interleaving should be
1213   /// avoided up front.
1214   FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
1215 
1216   /// \return True if runtime checks are required for vectorization, and false
1217   /// otherwise.
1218   bool runtimeChecksRequired();
1219 
1220   /// Setup cost-based decisions for user vectorization factor.
1221   /// \return true if the UserVF is a feasible VF to be chosen.
1222   bool selectUserVectorizationFactor(ElementCount UserVF) {
1223     collectUniformsAndScalars(UserVF);
1224     collectInstsToScalarize(UserVF);
1225     return expectedCost(UserVF).first.isValid();
1226   }
1227 
1228   /// \return The size (in bits) of the smallest and widest types in the code
1229   /// that needs to be vectorized. We ignore values that remain scalar such as
1230   /// 64 bit loop indices.
1231   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1232 
1233   /// \return The desired interleave count.
1234   /// If interleave count has been specified by metadata it will be returned.
1235   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1236   /// are the selected vectorization factor and the cost of the selected VF.
1237   unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost);
1238 
1239   /// A memory access instruction may be vectorized in more than one way; the
1240   /// form of the instruction after vectorization depends on cost.
1241   /// This function takes cost-based decisions for Load/Store instructions
1242   /// and collects them in a map. This decision map is used for building
1243   /// the lists of loop-uniform and loop-scalar instructions.
1244   /// The calculated cost is saved with the widening decision in order to
1245   /// avoid redundant calculations.
1246   void setCostBasedWideningDecision(ElementCount VF);
1247 
1248   /// A call may be vectorized in different ways depending on whether we have
1249   /// vectorized variants available and whether the target supports masking.
1250   /// This function analyzes all calls in the function at the supplied VF,
1251   /// makes a decision based on the costs of available options, and stores that
1252   /// decision in a map for use in planning and plan execution.
1253   void setVectorizedCallDecision(ElementCount VF);
1254 
1255   /// A struct that represents some properties of the register usage
1256   /// of a loop.
1257   struct RegisterUsage {
1258     /// Holds the number of loop invariant values that are used in the loop.
1259     /// The key is ClassID of target-provided register class.
1260     SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1261     /// Holds the maximum number of concurrent live intervals in the loop.
1262     /// The key is ClassID of target-provided register class.
1263     SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1264   };
1265 
1266   /// \return Information about the register usage of the loop for the given
1267   /// vectorization factors.
1268   SmallVector<RegisterUsage, 8>
1269   calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1270 
1271   /// Collect values we want to ignore in the cost model.
1272   void collectValuesToIgnore();
1273 
1274   /// Collect all element types in the loop for which widening is needed.
1275   void collectElementTypesForWidening();
1276 
1277   /// Split reductions into those that happen in the loop, and those that
1278   /// happen outside. In-loop reductions are collected into InLoopReductions.
1279   void collectInLoopReductions();
1280 
1281   /// Returns true if we should use strict in-order reductions for the given
1282   /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
1283   /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
1284   /// of FP operations.
1285   bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
1286     return !Hints->allowReordering() && RdxDesc.isOrdered();
1287   }
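
  // Illustrative sketch (names are placeholders): an ordered, in-order FP
  // reduction such as
  //
  //   float s = 0.0f;
  //   for (i = 0; i < n; i++)
  //     s += a[i];
  //
  // must preserve the original left-to-right association when vectorized,
  // e.g. via a chained in-order reduction of each vector rather than a
  // reassociating tree reduction, unless reordering of FP operations is
  // explicitly allowed.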
1288 
1289   /// \returns The smallest bitwidth each instruction can be represented with.
1290   /// The vector equivalents of these instructions should be truncated to this
1291   /// type.
1292   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1293     return MinBWs;
1294   }
1295 
1296   /// \returns True if it is more profitable to scalarize instruction \p I for
1297   /// vectorization factor \p VF.
1298   bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1299     assert(VF.isVector() &&
1300            "Profitable to scalarize relevant only for VF > 1.");
1301 
1302     // Cost model is not run in the VPlan-native path - return conservative
1303     // result until this changes.
1304     if (EnableVPlanNativePath)
1305       return false;
1306 
1307     auto Scalars = InstsToScalarize.find(VF);
1308     assert(Scalars != InstsToScalarize.end() &&
1309            "VF not yet analyzed for scalarization profitability");
1310     return Scalars->second.contains(I);
1311   }
1312 
1313   /// Returns true if \p I is known to be uniform after vectorization.
1314   bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1315     // A pseudo probe needs to be duplicated for each unrolled iteration and
1316     // vector lane so that the profiled loop trip count can be accurately
1317     // accumulated instead of being undercounted.
1318     if (isa<PseudoProbeInst>(I))
1319       return false;
1320 
1321     if (VF.isScalar())
1322       return true;
1323 
1324     // Cost model is not run in the VPlan-native path - return conservative
1325     // result until this changes.
1326     if (EnableVPlanNativePath)
1327       return false;
1328 
1329     auto UniformsPerVF = Uniforms.find(VF);
1330     assert(UniformsPerVF != Uniforms.end() &&
1331            "VF not yet analyzed for uniformity");
1332     return UniformsPerVF->second.count(I);
1333   }
1334 
1335   /// Returns true if \p I is known to be scalar after vectorization.
1336   bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1337     if (VF.isScalar())
1338       return true;
1339 
1340     // Cost model is not run in the VPlan-native path - return conservative
1341     // result until this changes.
1342     if (EnableVPlanNativePath)
1343       return false;
1344 
1345     auto ScalarsPerVF = Scalars.find(VF);
1346     assert(ScalarsPerVF != Scalars.end() &&
1347            "Scalar values are not calculated for VF");
1348     return ScalarsPerVF->second.count(I);
1349   }
1350 
1351   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1352   /// for vectorization factor \p VF.
1353   bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1354     return VF.isVector() && MinBWs.contains(I) &&
1355            !isProfitableToScalarize(I, VF) &&
1356            !isScalarAfterVectorization(I, VF);
1357   }
1358 
1359   /// Decision that was taken during cost calculation for memory instruction.
1360   enum InstWidening {
1361     CM_Unknown,
1362     CM_Widen,         // For consecutive accesses with stride +1.
1363     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1364     CM_Interleave,
1365     CM_GatherScatter,
1366     CM_Scalarize,
1367     CM_VectorCall,
1368     CM_IntrinsicCall
1369   };
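
  // Illustrative sketch (array names are placeholders): for a load of A[i]
  // with unit stride the decision is typically CM_Widen (one wide load per VF
  // elements); for A[n - i] it is CM_Widen_Reverse (a wide load plus a reverse
  // shuffle); for an indexed access like A[B[i]] it is CM_GatherScatter or
  // CM_Scalarize, depending on target support and cost.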
1370 
1371   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1372   /// instruction \p I and vector width \p VF.
1373   void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1374                            InstructionCost Cost) {
1375     assert(VF.isVector() && "Expected VF >=2");
1376     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1377   }
1378 
1379   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1380   /// interleaving group \p Grp and vector width \p VF.
1381   void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1382                            ElementCount VF, InstWidening W,
1383                            InstructionCost Cost) {
1384     assert(VF.isVector() && "Expected VF >=2");
1385     // Broadcast this decision to all instructions inside the group,
1386     // but assign the cost to the insert-position member only.
1387     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1388       if (auto *I = Grp->getMember(i)) {
1389         if (Grp->getInsertPos() == I)
1390           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1391         else
1392           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1393       }
1394     }
1395   }
1396 
1397   /// Return the cost model decision for the given instruction \p I and vector
1398   /// width \p VF. Return CM_Unknown if this instruction did not pass
1399   /// through the cost modeling.
1400   InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1401     assert(VF.isVector() && "Expected VF to be a vector VF");
1402     // Cost model is not run in the VPlan-native path - return conservative
1403     // result until this changes.
1404     if (EnableVPlanNativePath)
1405       return CM_GatherScatter;
1406 
1407     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1408     auto Itr = WideningDecisions.find(InstOnVF);
1409     if (Itr == WideningDecisions.end())
1410       return CM_Unknown;
1411     return Itr->second.first;
1412   }
1413 
1414   /// Return the vectorization cost for the given instruction \p I and vector
1415   /// width \p VF.
1416   InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1417     assert(VF.isVector() && "Expected VF >=2");
1418     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1419     assert(WideningDecisions.contains(InstOnVF) &&
1420            "The cost is not calculated");
1421     return WideningDecisions[InstOnVF].second;
1422   }
1423 
1424   struct CallWideningDecision {
1425     InstWidening Kind;
1426     Function *Variant;
1427     Intrinsic::ID IID;
1428     std::optional<unsigned> MaskPos;
1429     InstructionCost Cost;
1430   };
1431 
1432   void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind,
1433                                Function *Variant, Intrinsic::ID IID,
1434                                std::optional<unsigned> MaskPos,
1435                                InstructionCost Cost) {
1436     assert(!VF.isScalar() && "Expected vector VF");
1437     CallWideningDecisions[std::make_pair(CI, VF)] = {Kind, Variant, IID,
1438                                                      MaskPos, Cost};
1439   }
1440 
1441   CallWideningDecision getCallWideningDecision(CallInst *CI,
1442                                                ElementCount VF) const {
1443     assert(!VF.isScalar() && "Expected vector VF");
1444     return CallWideningDecisions.at(std::make_pair(CI, VF));
1445   }
1446 
1447   /// Return True if instruction \p I is an optimizable truncate whose operand
1448   /// is an induction variable. Such a truncate will be removed by adding a new
1449   /// induction variable with the destination type.
1450   bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1451     // If the instruction is not a truncate, return false.
1452     auto *Trunc = dyn_cast<TruncInst>(I);
1453     if (!Trunc)
1454       return false;
1455 
1456     // Get the source and destination types of the truncate.
1457     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1458     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1459 
1460     // If the truncate is free for the given types, return false. Replacing a
1461     // free truncate with an induction variable would add an induction variable
1462     // update instruction to each iteration of the loop. We exclude from this
1463     // check the primary induction variable since it will need an update
1464     // instruction regardless.
1465     Value *Op = Trunc->getOperand(0);
1466     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1467       return false;
1468 
1469     // If the truncated value is not an induction variable, return false.
1470     return Legal->isInductionPhi(Op);
1471   }
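
  // Illustrative sketch (names are placeholders): for a 64-bit primary
  // induction variable %iv, a use such as
  //
  //   %idx = trunc i64 %iv to i32
  //
  // can be optimized by introducing a new i32 induction variable with the
  // destination type, removing the truncate, provided the truncate is not
  // already free for the target.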
1472 
1473   /// Collects the instructions to scalarize for each predicated instruction in
1474   /// the loop.
1475   void collectInstsToScalarize(ElementCount VF);
1476 
1477   /// Collect Uniform and Scalar values for the given \p VF.
1478   /// The sets depend on CM decision for Load/Store instructions
1479   /// that may be vectorized as interleave, gather-scatter or scalarized.
1480   /// Also make a decision on what to do about call instructions in the loop
1481   /// at that VF -- scalarize, call a known vector routine, or call a
1482   /// vector intrinsic.
1483   void collectUniformsAndScalars(ElementCount VF) {
1484     // Do the analysis once.
1485     if (VF.isScalar() || Uniforms.contains(VF))
1486       return;
1487     setCostBasedWideningDecision(VF);
1488     setVectorizedCallDecision(VF);
1489     collectLoopUniforms(VF);
1490     collectLoopScalars(VF);
1491   }
1492 
1493   /// Returns true if the target machine supports a masked store operation
1494   /// for the given \p DataType and kind of access to \p Ptr.
1495   bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1496     return Legal->isConsecutivePtr(DataType, Ptr) &&
1497            TTI.isLegalMaskedStore(DataType, Alignment);
1498   }
1499 
1500   /// Returns true if the target machine supports masked load operation
1501   /// for the given \p DataType and kind of access to \p Ptr.
1502   /// Returns true if the target machine supports a masked load operation
1503     return Legal->isConsecutivePtr(DataType, Ptr) &&
1504            TTI.isLegalMaskedLoad(DataType, Alignment);
1505   }
1506 
1507   /// Returns true if the target machine can represent \p V as a masked gather
1508   /// or scatter operation.
1509   bool isLegalGatherOrScatter(Value *V, ElementCount VF) {
1510     bool LI = isa<LoadInst>(V);
1511     bool SI = isa<StoreInst>(V);
1512     if (!LI && !SI)
1513       return false;
1514     auto *Ty = getLoadStoreType(V);
1515     Align Align = getLoadStoreAlignment(V);
1516     if (VF.isVector())
1517       Ty = VectorType::get(Ty, VF);
1518     return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1519            (SI && TTI.isLegalMaskedScatter(Ty, Align));
1520   }
1521 
1522   /// Returns true if the target machine supports all of the reduction
1523   /// variables found for the given VF.
1524   bool canVectorizeReductions(ElementCount VF) const {
1525     return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1526       const RecurrenceDescriptor &RdxDesc = Reduction.second;
1527       return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1528     }));
1529   }
1530 
1531   /// Given costs for both strategies, return true if the scalar predication
1532   /// lowering should be used for div/rem.  This incorporates an override
1533   /// option so it is not simply a cost comparison.
1534   bool isDivRemScalarWithPredication(InstructionCost ScalarCost,
1535                                      InstructionCost SafeDivisorCost) const {
1536     switch (ForceSafeDivisor) {
1537     case cl::BOU_UNSET:
1538       return ScalarCost < SafeDivisorCost;
1539     case cl::BOU_TRUE:
1540       return false;
1541     case cl::BOU_FALSE:
1542       return true;
1543     };
1544     llvm_unreachable("impossible case value");
1545   }
1546 
1547   /// Returns true if \p I is an instruction which requires predication and
1548   /// for which our chosen predication strategy is scalarization (i.e. we
1549   /// don't have an alternate strategy such as masking available).
1550   /// \p VF is the vectorization factor that will be used to vectorize \p I.
1551   bool isScalarWithPredication(Instruction *I, ElementCount VF) const;
1552 
1553   /// Returns true if \p I is an instruction that needs to be predicated
1554   /// at runtime.  The result is independent of the predication mechanism.
1555   /// Superset of instructions that return true for isScalarWithPredication.
1556   bool isPredicatedInst(Instruction *I) const;
1557 
1558   /// Return the costs for our two available strategies for lowering a
1559   /// div/rem operation which requires speculating at least one lane.
1560   /// First result is for scalarization (will be invalid for scalable
1561   /// vectors); second is for the safe-divisor strategy.
1562   std::pair<InstructionCost, InstructionCost>
1563   getDivRemSpeculationCost(Instruction *I,
1564                            ElementCount VF) const;
1565 
1566   /// Returns true if \p I is a memory instruction with consecutive memory
1567   /// access that can be widened.
1568   bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF);
1569 
1570   /// Returns true if \p I is a memory instruction in an interleaved-group
1571   /// of memory accesses that can be vectorized with wide vector loads/stores
1572   /// and shuffles.
1573   bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF);
1574 
1575   /// Check if \p Instr belongs to any interleaved access group.
1576   bool isAccessInterleaved(Instruction *Instr) {
1577     return InterleaveInfo.isInterleaved(Instr);
1578   }
1579 
1580   /// Get the interleaved access group that \p Instr belongs to.
1581   const InterleaveGroup<Instruction> *
1582   getInterleavedAccessGroup(Instruction *Instr) {
1583     return InterleaveInfo.getInterleaveGroup(Instr);
1584   }
1585 
1586   /// Returns true if we're required to use a scalar epilogue for at least
1587   /// the final iteration of the original loop.
1588   bool requiresScalarEpilogue(bool IsVectorizing) const {
1589     if (!isScalarEpilogueAllowed())
1590       return false;
1591     // If we might exit from anywhere but the latch, we must run the exiting
1592     // iteration in scalar form.
1593     if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
1594       return true;
1595     return IsVectorizing && InterleaveInfo.requiresScalarEpilogue();
1596   }
1597 
1598   /// Returns true if we're required to use a scalar epilogue for at least
1599   /// the final iteration of the original loop for all VFs in \p Range.
1600   /// A scalar epilogue must either be required for all VFs in \p Range or for
1601   /// none.
1602   bool requiresScalarEpilogue(VFRange Range) const {
1603     auto RequiresScalarEpilogue = [this](ElementCount VF) {
1604       return requiresScalarEpilogue(VF.isVector());
1605     };
1606     bool IsRequired = all_of(Range, RequiresScalarEpilogue);
1607     assert(
1608         (IsRequired || none_of(Range, RequiresScalarEpilogue)) &&
1609         "all VFs in range must agree on whether a scalar epilogue is required");
1610     return IsRequired;
1611   }
1612 
1613   /// Returns true if a scalar epilogue is allowed, i.e. it has not been
1614   /// disallowed due to optsize or a loop hint annotation.
1615   bool isScalarEpilogueAllowed() const {
1616     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1617   }
1618 
1619   /// Returns the TailFoldingStyle that is best for the current loop.
1620   TailFoldingStyle
1621   getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
1622     if (!CanFoldTailByMasking)
1623       return TailFoldingStyle::None;
1624 
1625     if (ForceTailFoldingStyle.getNumOccurrences())
1626       return ForceTailFoldingStyle;
1627 
1628     return TTI.getPreferredTailFoldingStyle(IVUpdateMayOverflow);
1629   }
1630 
1631   /// Returns true if all loop blocks should be masked to fold tail loop.
1632   bool foldTailByMasking() const {
1633     return getTailFoldingStyle() != TailFoldingStyle::None;
1634   }
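
  // Illustrative sketch (names are placeholders): with tail folding by
  // masking there is no scalar remainder; the vector body executes all
  // iterations under a per-lane predicate, roughly:
  //
  //   for (i = 0; i < N; i += VF) {
  //     mask = (i + <0, 1, ..., VF-1>) < N;     // active lanes only
  //     A[i:i+VF-1] = B[i:i+VF-1] + 1, masked by mask;
  //   }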
1635 
1636   /// Returns true if the instructions in this block require predication
1637   /// for any reason, e.g. because tail folding now requires a predicate
1638   /// or because the block in the original loop was predicated.
1639   bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
1640     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1641   }
1642 
1643   /// Returns true if the Phi is part of an inloop reduction.
1644   bool isInLoopReduction(PHINode *Phi) const {
1645     return InLoopReductions.contains(Phi);
1646   }
1647 
1648   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1649   /// with factor VF.  Return the cost of the instruction, including
1650   /// scalarization overhead if it's needed.
1651   InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1652 
1653   /// Estimate cost of a call instruction CI if it were vectorized with factor
1654   /// VF. Return the cost of the instruction, including scalarization overhead
1655   /// if it's needed.
1656   InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const;
1657 
1658   /// Invalidates decisions already taken by the cost model.
1659   void invalidateCostModelingDecisions() {
1660     WideningDecisions.clear();
1661     CallWideningDecisions.clear();
1662     Uniforms.clear();
1663     Scalars.clear();
1664   }
1665 
1666   /// The vectorization cost is a combination of the cost itself and a boolean
1667   /// indicating whether any of the contributing operations will actually
1668   /// operate on vector values after type legalization in the backend. If this
1669   /// latter value is false, then all operations will be scalarized (i.e. no
1670   /// vectorization has actually taken place).
1671   using VectorizationCostTy = std::pair<InstructionCost, bool>;
1672 
1673   /// Returns the expected execution cost. The unit of the cost does
1674   /// not matter because we use the 'cost' units to compare different
1675   /// vector widths. The cost that is returned is *not* normalized by
1676   /// the factor width. If \p Invalid is not nullptr, this function
1677   /// will add a pair(Instruction*, ElementCount) to \p Invalid for
1678   /// each instruction that has an Invalid cost for the given VF.
1679   VectorizationCostTy
1680   expectedCost(ElementCount VF,
1681                SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);
1682 
1683   bool hasPredStores() const { return NumPredStores > 0; }
1684 
1685   /// Returns true if epilogue vectorization is considered profitable, and
1686   /// false otherwise.
1687   /// \p VF is the vectorization factor chosen for the original loop.
1688   bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
1689 
1690 private:
1691   unsigned NumPredStores = 0;
1692 
1693   /// \return An upper bound for the vectorization factors for both
1694   /// fixed and scalable vectorization, where the minimum-known number of
1695   /// elements is a power-of-2 larger than zero. If scalable vectorization is
1696   /// disabled or unsupported, then the scalable part will be equal to
1697   /// ElementCount::getScalable(0).
1698   FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount,
1699                                            ElementCount UserVF,
1700                                            bool FoldTailByMasking);
1701 
1702   /// \return the maximized element count based on the target's vector
1703   /// registers and the loop trip-count, but limited to a maximum safe VF.
1704   /// This is a helper function of computeFeasibleMaxVF.
1705   ElementCount getMaximizedVFForTarget(unsigned MaxTripCount,
1706                                        unsigned SmallestType,
1707                                        unsigned WidestType,
1708                                        ElementCount MaxSafeVF,
1709                                        bool FoldTailByMasking);
1710 
1711   /// \return the maximum legal scalable VF, based on the safe max number
1712   /// of elements.
1713   ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1714 
1715   /// Returns the execution time cost of an instruction for a given vector
1716   /// width. Vector width of one means scalar.
1717   VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1718 
1719   /// The cost-computation logic from getInstructionCost which provides
1720   /// the vector type as an output parameter.
1721   InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
1722                                      Type *&VectorTy);
1723 
1724   /// Return the cost of instructions in an inloop reduction pattern, if I is
1725   /// part of that pattern.
1726   std::optional<InstructionCost>
1727   getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
1728                           TTI::TargetCostKind CostKind) const;
1729 
1730   /// Calculate vectorization cost of memory instruction \p I.
1731   InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1732 
1733   /// The cost computation for scalarized memory instruction.
1734   InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1735 
1736   /// The cost computation for interleaving group of memory instructions.
1737   InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1738 
1739   /// The cost computation for Gather/Scatter instruction.
1740   InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1741 
1742   /// The cost computation for widening instruction \p I with consecutive
1743   /// memory access.
1744   InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1745 
1746   /// The cost calculation for Load/Store instruction \p I with uniform pointer:
1747   /// Load: scalar load + broadcast.
1748   /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1749   /// element)
1750   InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1751 
1752   /// Estimate the overhead of scalarizing an instruction. This is a
1753   /// convenience wrapper for the type-based getScalarizationOverhead API.
1754   InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF,
1755                                            TTI::TargetCostKind CostKind) const;
1756 
1757   /// Returns true if an artificially high cost for emulated masked memrefs
1758   /// should be used.
1759   bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1760 
1761   /// Map of scalar integer values to the smallest bitwidth they can be legally
1762   /// represented as. The vector equivalents of these values should be truncated
1763   /// to this type.
1764   MapVector<Instruction *, uint64_t> MinBWs;
1765 
1766   /// A type representing the costs for instructions if they were to be
1767   /// scalarized rather than vectorized. The entries are Instruction-Cost
1768   /// pairs.
1769   using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1770 
1771   /// Holds, per vectorization factor, the BasicBlocks that are known to be
1772   /// present after vectorization as predicated blocks.
1773   DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>>
1774       PredicatedBBsAfterVectorization;
1775 
1776   /// Records whether it is allowed to have the original scalar loop execute at
1777   /// least once. This may be needed as a fallback loop in case runtime
1778   /// aliasing/dependence checks fail, or to handle the tail/remainder
1779   /// iterations when the trip count is unknown or isn't divisible by the VF,
1780   /// or as a peel-loop to handle gaps in interleave-groups.
1781   /// Under optsize and when the trip count is very small we don't allow any
1782   /// iterations to execute in the scalar loop.
1783   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1784 
1785   /// All loop blocks are to be masked to fold the tail of scalar iterations.
1786   bool CanFoldTailByMasking = false;
1787 
1788   /// A map holding scalar costs for different vectorization factors. The
1789   /// presence of a cost for an instruction in the mapping indicates that the
1790   /// instruction will be scalarized when vectorizing with the associated
1791   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1792   DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1793 
1794   /// Holds the instructions known to be uniform after vectorization.
1795   /// The data is collected per VF.
1796   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1797 
1798   /// Holds the instructions known to be scalar after vectorization.
1799   /// The data is collected per VF.
1800   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1801 
1802   /// Holds the instructions (address computations) that are forced to be
1803   /// scalarized.
1804   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1805 
1806   /// PHINodes of the reductions that should be expanded in-loop.
1807   SmallPtrSet<PHINode *, 4> InLoopReductions;
1808 
1809   /// A Map of inloop reduction operations and their immediate chain operand.
1810   /// FIXME: This can be removed once reductions can be costed correctly in
1811   /// VPlan. This was added to allow quick lookup of the inloop operations.
1812   DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1813 
1814   /// Returns the expected difference in cost from scalarizing the expression
1815   /// feeding a predicated instruction \p PredInst. The instructions to
1816   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1817   /// non-negative return value implies the expression will be scalarized.
1818   /// Currently, only single-use chains are considered for scalarization.
1819   InstructionCost computePredInstDiscount(Instruction *PredInst,
1820                                           ScalarCostsTy &ScalarCosts,
1821                                           ElementCount VF);
1822 
1823   /// Collect the instructions that are uniform after vectorization. An
1824   /// instruction is uniform if we represent it with a single scalar value in
1825   /// the vectorized loop corresponding to each vector iteration. Examples of
1826   /// uniform instructions include pointer operands of consecutive or
1827   /// interleaved memory accesses. Note that although uniformity implies an
1828   /// instruction will be scalar, the reverse is not true. In general, a
1829   /// scalarized instruction will be represented by VF scalar values in the
1830   /// vectorized loop, each corresponding to an iteration of the original
1831   /// scalar loop.
1832   void collectLoopUniforms(ElementCount VF);
1833 
1834   /// Collect the instructions that are scalar after vectorization. An
1835   /// instruction is scalar if it is known to be uniform or will be scalarized
1836   /// during vectorization. collectLoopScalars should only add non-uniform nodes
1837   /// to the list if they are used by a load/store instruction that is marked as
1838   /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
1839   /// VF values in the vectorized loop, each corresponding to an iteration of
1840   /// the original scalar loop.
1841   void collectLoopScalars(ElementCount VF);
1842 
1843   /// Keeps the cost model's vectorization decisions and costs for instructions.
1844   /// Right now it is used for memory instructions only.
1845   using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1846                                 std::pair<InstWidening, InstructionCost>>;
1847 
1848   DecisionList WideningDecisions;
1849 
1850   using CallDecisionList =
1851       DenseMap<std::pair<CallInst *, ElementCount>, CallWideningDecision>;
1852 
1853   CallDecisionList CallWideningDecisions;
1854 
1855   /// Returns true if \p V is expected to be vectorized and it needs to be
1856   /// extracted.
1857   bool needsExtract(Value *V, ElementCount VF) const {
1858     Instruction *I = dyn_cast<Instruction>(V);
1859     if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1860         TheLoop->isLoopInvariant(I))
1861       return false;
1862 
1863     // Assume we can vectorize V (and hence we need extraction) if the
1864     // scalars are not computed yet. This can happen, because it is called
1865     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1866     // the scalars are collected. That should be a safe assumption in most
1867     // cases, because we check if the operands have vectorizable types
1868     // beforehand in LoopVectorizationLegality.
1869     return !Scalars.contains(VF) || !isScalarAfterVectorization(I, VF);
1870   };
1871 
1872   /// Returns a range containing only operands needing to be extracted.
1873   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1874                                                    ElementCount VF) const {
1875     return SmallVector<Value *, 4>(make_filter_range(
1876         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1877   }
1878 
1879 public:
1880   /// The loop that we evaluate.
1881   Loop *TheLoop;
1882 
1883   /// Predicated scalar evolution analysis.
1884   PredicatedScalarEvolution &PSE;
1885 
1886   /// Loop Info analysis.
1887   LoopInfo *LI;
1888 
1889   /// Vectorization legality.
1890   LoopVectorizationLegality *Legal;
1891 
1892   /// Vector target information.
1893   const TargetTransformInfo &TTI;
1894 
1895   /// Target Library Info.
1896   const TargetLibraryInfo *TLI;
1897 
1898   /// Demanded bits analysis.
1899   DemandedBits *DB;
1900 
1901   /// Assumption cache.
1902   AssumptionCache *AC;
1903 
1904   /// Interface to emit optimization remarks.
1905   OptimizationRemarkEmitter *ORE;
1906 
1907   const Function *TheFunction;
1908 
1909   /// Loop Vectorize Hint.
1910   const LoopVectorizeHints *Hints;
1911 
1912   /// The interleave access information contains groups of interleaved accesses
1913   /// with the same stride and close to each other.
1914   InterleavedAccessInfo &InterleaveInfo;
1915 
1916   /// Values to ignore in the cost model.
1917   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1918 
1919   /// Values to ignore in the cost model when VF > 1.
1920   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1921 
1922   /// All element types found in the loop.
1923   SmallPtrSet<Type *, 16> ElementTypesInLoop;
1924 };
1925 } // end namespace llvm
1926 
1927 namespace {
1928 /// Helper struct to manage generating runtime checks for vectorization.
1929 ///
1930 /// The runtime checks are created up-front in temporary blocks, un-linked from
1931 /// the existing IR, to allow better cost estimation. After deciding to
1932 /// vectorize, the checks are moved back. If deciding not to vectorize, the
1933 /// temporary blocks are completely removed.
1934 class GeneratedRTChecks {
1935   /// Basic block which contains the generated SCEV checks, if any.
1936   BasicBlock *SCEVCheckBlock = nullptr;
1937 
1938   /// The value representing the result of the generated SCEV checks. If it is
1939   /// nullptr, either no SCEV checks have been generated or they have been used.
1940   Value *SCEVCheckCond = nullptr;
1941 
1942   /// Basic block which contains the generated memory runtime checks, if any.
1943   BasicBlock *MemCheckBlock = nullptr;
1944 
1945   /// The value representing the result of the generated memory runtime checks.
1946   /// If it is nullptr, either no memory runtime checks have been generated or
1947   /// they have been used.
1948   Value *MemRuntimeCheckCond = nullptr;
1949 
1950   DominatorTree *DT;
1951   LoopInfo *LI;
1952   TargetTransformInfo *TTI;
1953 
1954   SCEVExpander SCEVExp;
1955   SCEVExpander MemCheckExp;
1956 
1957   bool CostTooHigh = false;
1958   const bool AddBranchWeights;
1959 
1960 public:
1961   GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
1962                     TargetTransformInfo *TTI, const DataLayout &DL,
1963                     bool AddBranchWeights)
1964       : DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"),
1965         MemCheckExp(SE, DL, "scev.check"), AddBranchWeights(AddBranchWeights) {}
1966 
1967   /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1968   /// accurately estimate the cost of the runtime checks. The blocks are
1969   /// un-linked from the IR and are added back during vector code generation. If
1970   /// there is no vector code generation, the check blocks are removed
1971   /// completely.
1972   void Create(Loop *L, const LoopAccessInfo &LAI,
1973               const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {
1974 
1975     // Hard cutoff to limit compile-time increase in case a very large number of
1976     // runtime checks needs to be generated.
1977     // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
1978     // profile info.
1979     CostTooHigh =
1980         LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold;
1981     if (CostTooHigh)
1982       return;
1983 
1984     BasicBlock *LoopHeader = L->getHeader();
1985     BasicBlock *Preheader = L->getLoopPreheader();
1986 
1987     // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1988     // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1989     // may be used by SCEVExpander. The blocks will be un-linked from their
1990     // predecessors and removed from LI & DT at the end of the function.
1991     if (!UnionPred.isAlwaysTrue()) {
1992       SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1993                                   nullptr, "vector.scevcheck");
1994 
1995       SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1996           &UnionPred, SCEVCheckBlock->getTerminator());
1997     }
1998 
1999     const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
2000     if (RtPtrChecking.Need) {
2001       auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
2002       MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
2003                                  "vector.memcheck");
2004 
2005       auto DiffChecks = RtPtrChecking.getDiffChecks();
2006       if (DiffChecks) {
2007         Value *RuntimeVF = nullptr;
2008         MemRuntimeCheckCond = addDiffRuntimeChecks(
2009             MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp,
2010             [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
2011               if (!RuntimeVF)
2012                 RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);
2013               return RuntimeVF;
2014             },
2015             IC);
2016       } else {
2017         MemRuntimeCheckCond = addRuntimeChecks(
2018             MemCheckBlock->getTerminator(), L, RtPtrChecking.getChecks(),
2019             MemCheckExp, VectorizerParams::HoistRuntimeChecks);
2020       }
2021       assert(MemRuntimeCheckCond &&
2022              "no RT checks generated although RtPtrChecking "
2023              "claimed checks are required");
2024     }
2025 
2026     if (!MemCheckBlock && !SCEVCheckBlock)
2027       return;
2028 
2029     // Unhook the temporary blocks with the checks and update various places
2030     // accordingly.
2031     if (SCEVCheckBlock)
2032       SCEVCheckBlock->replaceAllUsesWith(Preheader);
2033     if (MemCheckBlock)
2034       MemCheckBlock->replaceAllUsesWith(Preheader);
2035 
2036     if (SCEVCheckBlock) {
2037       SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
2038       new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
2039       Preheader->getTerminator()->eraseFromParent();
2040     }
2041     if (MemCheckBlock) {
2042       MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
2043       new UnreachableInst(Preheader->getContext(), MemCheckBlock);
2044       Preheader->getTerminator()->eraseFromParent();
2045     }
2046 
2047     DT->changeImmediateDominator(LoopHeader, Preheader);
2048     if (MemCheckBlock) {
2049       DT->eraseNode(MemCheckBlock);
2050       LI->removeBlock(MemCheckBlock);
2051     }
2052     if (SCEVCheckBlock) {
2053       DT->eraseNode(SCEVCheckBlock);
2054       LI->removeBlock(SCEVCheckBlock);
2055     }
2056   }
2057 
2058   InstructionCost getCost() {
2059     if (SCEVCheckBlock || MemCheckBlock)
2060       LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");
2061 
2062     if (CostTooHigh) {
2063       InstructionCost Cost;
2064       Cost.setInvalid();
2065       LLVM_DEBUG(dbgs() << "  number of checks exceeded threshold\n");
2066       return Cost;
2067     }
2068 
2069     InstructionCost RTCheckCost = 0;
2070     if (SCEVCheckBlock)
2071       for (Instruction &I : *SCEVCheckBlock) {
2072         if (SCEVCheckBlock->getTerminator() == &I)
2073           continue;
2074         InstructionCost C =
2075             TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
2076         LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
2077         RTCheckCost += C;
2078       }
2079     if (MemCheckBlock)
2080       for (Instruction &I : *MemCheckBlock) {
2081         if (MemCheckBlock->getTerminator() == &I)
2082           continue;
2083         InstructionCost C =
2084             TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
2085         LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
2086         RTCheckCost += C;
2087       }
2088 
2089     if (SCEVCheckBlock || MemCheckBlock)
2090       LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
2091                         << "\n");
2092 
2093     return RTCheckCost;
2094   }
2095 
2096   /// Remove the created SCEV & memory runtime check blocks & instructions, if
2097   /// unused.
2098   ~GeneratedRTChecks() {
2099     SCEVExpanderCleaner SCEVCleaner(SCEVExp);
2100     SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
2101     if (!SCEVCheckCond)
2102       SCEVCleaner.markResultUsed();
2103 
2104     if (!MemRuntimeCheckCond)
2105       MemCheckCleaner.markResultUsed();
2106 
2107     if (MemRuntimeCheckCond) {
2108       auto &SE = *MemCheckExp.getSE();
2109       // Memory runtime check generation creates compares that use expanded
2110       // values. Remove them before running the SCEVExpanderCleaners.
2111       for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
2112         if (MemCheckExp.isInsertedInstruction(&I))
2113           continue;
2114         SE.forgetValue(&I);
2115         I.eraseFromParent();
2116       }
2117     }
2118     MemCheckCleaner.cleanup();
2119     SCEVCleaner.cleanup();
2120 
2121     if (SCEVCheckCond)
2122       SCEVCheckBlock->eraseFromParent();
2123     if (MemRuntimeCheckCond)
2124       MemCheckBlock->eraseFromParent();
2125   }
2126 
2127   /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
2128   /// adjusts the branches to branch to the vector preheader or \p Bypass,
2129   /// depending on the generated condition.
2130   BasicBlock *emitSCEVChecks(BasicBlock *Bypass,
2131                              BasicBlock *LoopVectorPreHeader,
2132                              BasicBlock *LoopExitBlock) {
2133     if (!SCEVCheckCond)
2134       return nullptr;
2135 
2136     Value *Cond = SCEVCheckCond;
2137     // Mark the check as used, to prevent it from being removed during cleanup.
2138     SCEVCheckCond = nullptr;
2139     if (auto *C = dyn_cast<ConstantInt>(Cond))
2140       if (C->isZero())
2141         return nullptr;
2142 
2143     auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2144 
2145     BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
2146     // Create new preheader for vector loop.
2147     if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2148       PL->addBasicBlockToLoop(SCEVCheckBlock, *LI);
2149 
2150     SCEVCheckBlock->getTerminator()->eraseFromParent();
2151     SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2152     Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2153                                                 SCEVCheckBlock);
2154 
2155     DT->addNewBlock(SCEVCheckBlock, Pred);
2156     DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
2157 
2158     BranchInst &BI = *BranchInst::Create(Bypass, LoopVectorPreHeader, Cond);
2159     if (AddBranchWeights)
2160       setBranchWeights(BI, SCEVCheckBypassWeights);
2161     ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), &BI);
2162     return SCEVCheckBlock;
2163   }
2164 
2165   /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2166   /// the branches to branch to the vector preheader or \p Bypass, depending on
2167   /// the generated condition.
2168   BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass,
2169                                    BasicBlock *LoopVectorPreHeader) {
2170     // Check if we generated code that checks at runtime whether arrays overlap.
2171     if (!MemRuntimeCheckCond)
2172       return nullptr;
2173 
2174     auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2175     Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2176                                                 MemCheckBlock);
2177 
2178     DT->addNewBlock(MemCheckBlock, Pred);
2179     DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
2180     MemCheckBlock->moveBefore(LoopVectorPreHeader);
2181 
2182     if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2183       PL->addBasicBlockToLoop(MemCheckBlock, *LI);
2184 
2185     BranchInst &BI =
2186         *BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond);
2187     if (AddBranchWeights) {
2188       setBranchWeights(BI, MemCheckBypassWeights);
2189     }
2190     ReplaceInstWithInst(MemCheckBlock->getTerminator(), &BI);
2191     MemCheckBlock->getTerminator()->setDebugLoc(
2192         Pred->getTerminator()->getDebugLoc());
2193 
2194     // Mark the check as used, to prevent it from being removed during cleanup.
2195     MemRuntimeCheckCond = nullptr;
2196     return MemCheckBlock;
2197   }
2198 };
2199 } // namespace
2200 
2201 static bool useActiveLaneMask(TailFoldingStyle Style) {
2202   return Style == TailFoldingStyle::Data ||
2203          Style == TailFoldingStyle::DataAndControlFlow ||
2204          Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2205 }
2206 
2207 static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style) {
2208   return Style == TailFoldingStyle::DataAndControlFlow ||
2209          Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2210 }
2211 
2212 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
2213 // vectorization. The loop needs to be annotated with #pragma omp simd
2214 // simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If the
2215 // vector length information is not provided, vectorization is not considered
2216 // explicit. Interleave hints are not allowed either. These limitations will be
2217 // relaxed in the future.
2218 // Please note that we are currently forced to abuse the pragma 'clang
2219 // vectorize' semantics. This pragma provides *auto-vectorization hints*
2220 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2221 // provides *explicit vectorization hints* (LV can bypass legal checks and
2222 // assume that vectorization is legal). However, both hints are implemented
2223 // using the same metadata (llvm.loop.vectorize, processed by
2224 // LoopVectorizeHints). This will be fixed in the future when the native IR
2225 // representation for pragma 'omp simd' is introduced.
2226 static bool isExplicitVecOuterLoop(Loop *OuterLp,
2227                                    OptimizationRemarkEmitter *ORE) {
2228   assert(!OuterLp->isInnermost() && "This is not an outer loop");
2229   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2230 
2231   // Only outer loops with an explicit vectorization hint are supported.
2232   // Unannotated outer loops are ignored.
2233   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
2234     return false;
2235 
2236   Function *Fn = OuterLp->getHeader()->getParent();
2237   if (!Hints.allowVectorization(Fn, OuterLp,
2238                                 true /*VectorizeOnlyWhenForced*/)) {
2239     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2240     return false;
2241   }
2242 
2243   if (Hints.getInterleave() > 1) {
2244     // TODO: Interleave support is future work.
2245     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2246                          "outer loops.\n");
2247     Hints.emitRemarkWithHints();
2248     return false;
2249   }
2250 
2251   return true;
2252 }
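
// Illustrative sketch (loop bounds are placeholders): an outer loop explicitly
// annotated for vectorization, e.g.
//
//   #pragma clang loop vectorize(enable) vectorize_width(4)
//   for (i = 0; i < N; i++)
//     for (j = 0; j < M; j++)
//       A[i][j] = B[i][j] + 1;
//
// is accepted above, while an unannotated outer loop is ignored.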
2253 
2254 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
2255                                   OptimizationRemarkEmitter *ORE,
2256                                   SmallVectorImpl<Loop *> &V) {
2257   // Collect inner loops and outer loops without irreducible control flow. For
2258   // now, only collect outer loops that have explicit vectorization hints. If we
2259   // are stress testing the VPlan H-CFG construction, we collect the outermost
2260   // loop of every loop nest.
2261   if (L.isInnermost() || VPlanBuildStressTest ||
2262       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
2263     LoopBlocksRPO RPOT(&L);
2264     RPOT.perform(LI);
2265     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2266       V.push_back(&L);
2267       // TODO: Collect inner loops inside marked outer loops in case
2268       // vectorization fails for the outer loop. Do not invoke
2269       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2270       // already known to be reducible. We can use an inherited attribute for
2271       // that.
2272       return;
2273     }
2274   }
2275   for (Loop *InnerL : L)
2276     collectSupportedLoops(*InnerL, LI, ORE, V);
2277 }
2278 
2279 //===----------------------------------------------------------------------===//
2280 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2281 // LoopVectorizationCostModel and LoopVectorizationPlanner.
2282 //===----------------------------------------------------------------------===//
2283 
2284 /// Compute the transformed value of Index at offset StartValue using step
2285 /// StepValue.
2286 /// For integer induction, returns StartValue + Index * StepValue.
2287 /// For pointer induction, returns StartValue[Index * StepValue].
2288 /// FIXME: The newly created binary instructions should contain nsw/nuw
2289 /// flags, which can be found from the original scalar operations.
2290 static Value *
2291 emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue,
2292                      Value *Step,
2293                      InductionDescriptor::InductionKind InductionKind,
2294                      const BinaryOperator *InductionBinOp) {
2295   Type *StepTy = Step->getType();
2296   Value *CastedIndex = StepTy->isIntegerTy()
2297                            ? B.CreateSExtOrTrunc(Index, StepTy)
2298                            : B.CreateCast(Instruction::SIToFP, Index, StepTy);
2299   if (CastedIndex != Index) {
2300     CastedIndex->setName(CastedIndex->getName() + ".cast");
2301     Index = CastedIndex;
2302   }
2303 
2304   // Note: the IR at this point is broken. We cannot use SE to create any new
2305   // SCEV and then expand it, hoping that SCEV's simplification will give us
2306   // more optimal code. Unfortunately, attempting to do so on invalid IR may
2307   // lead to various SCEV crashes. So all we can do is use the builder and rely
2308   // on InstCombine for future simplifications. Here we handle only some
2309   // trivial cases.
2310   auto CreateAdd = [&B](Value *X, Value *Y) {
2311     assert(X->getType() == Y->getType() && "Types don't match!");
2312     if (auto *CX = dyn_cast<ConstantInt>(X))
2313       if (CX->isZero())
2314         return Y;
2315     if (auto *CY = dyn_cast<ConstantInt>(Y))
2316       if (CY->isZero())
2317         return X;
2318     return B.CreateAdd(X, Y);
2319   };
2320 
2321   // We allow X to be a vector type, in which case Y will potentially be
2322   // splatted into a vector with the same element count.
2323   auto CreateMul = [&B](Value *X, Value *Y) {
2324     assert(X->getType()->getScalarType() == Y->getType() &&
2325            "Types don't match!");
2326     if (auto *CX = dyn_cast<ConstantInt>(X))
2327       if (CX->isOne())
2328         return Y;
2329     if (auto *CY = dyn_cast<ConstantInt>(Y))
2330       if (CY->isOne())
2331         return X;
2332     VectorType *XVTy = dyn_cast<VectorType>(X->getType());
2333     if (XVTy && !isa<VectorType>(Y->getType()))
2334       Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
2335     return B.CreateMul(X, Y);
2336   };
2337 
2338   switch (InductionKind) {
2339   case InductionDescriptor::IK_IntInduction: {
2340     assert(!isa<VectorType>(Index->getType()) &&
2341            "Vector indices not supported for integer inductions yet");
2342     assert(Index->getType() == StartValue->getType() &&
2343            "Index type does not match StartValue type");
2344     if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne())
2345       return B.CreateSub(StartValue, Index);
2346     auto *Offset = CreateMul(Index, Step);
2347     return CreateAdd(StartValue, Offset);
2348   }
2349   case InductionDescriptor::IK_PtrInduction: {
2350     return B.CreateGEP(B.getInt8Ty(), StartValue, CreateMul(Index, Step));
2351   }
2352   case InductionDescriptor::IK_FpInduction: {
2353     assert(!isa<VectorType>(Index->getType()) &&
2354            "Vector indices not supported for FP inductions yet");
2355     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2356     assert(InductionBinOp &&
2357            (InductionBinOp->getOpcode() == Instruction::FAdd ||
2358             InductionBinOp->getOpcode() == Instruction::FSub) &&
2359            "Original bin op should be defined for FP induction");
2360 
2361     Value *MulExp = B.CreateFMul(Step, Index);
2362     return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2363                          "induction");
2364   }
2365   case InductionDescriptor::IK_NoInduction:
2366     return nullptr;
2367   }
2368   llvm_unreachable("invalid enum");
2369 }
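
// Illustrative sketch (values are placeholders): for an integer induction with
// StartValue = 100, Step = 3 and Index = 4 the transformed value is
// 100 + 4 * 3 = 112; for a pointer induction the result is StartValue advanced
// by Index * Step bytes via an i8-typed GEP.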
2370 
2371 std::optional<unsigned> getMaxVScale(const Function &F,
2372                                      const TargetTransformInfo &TTI) {
2373   if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale())
2374     return MaxVScale;
2375 
2376   if (F.hasFnAttribute(Attribute::VScaleRange))
2377     return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
2378 
2379   return std::nullopt;
2380 }
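
// Illustrative sketch: if TTI does not report a maximum vscale but the
// function carries the attribute vscale_range(1, 16), the value 16 is
// returned here.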
2381 
2382 /// For the given VF and UF and maximum trip count computed for the loop, return
2383 /// whether the induction variable might overflow in the vectorized loop. If not,
2384 /// then we know a runtime overflow check always evaluates to false and can be
2385 /// removed.
2386 static bool isIndvarOverflowCheckKnownFalse(
2387     const LoopVectorizationCostModel *Cost,
2388     ElementCount VF, std::optional<unsigned> UF = std::nullopt) {
2389   // Always be conservative if we don't know the exact unroll factor.
2390   unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF);
2391 
2392   Type *IdxTy = Cost->Legal->getWidestInductionType();
2393   APInt MaxUIntTripCount = cast<IntegerType>(IdxTy)->getMask();
2394 
2395   // The runtime overflow check is known to be false iff the (max) trip-count
2396   // is known and (max) trip-count + (VF * UF) does not overflow in the type of
2397   // the vector loop induction variable.
2398   if (unsigned TC =
2399           Cost->PSE.getSE()->getSmallConstantMaxTripCount(Cost->TheLoop)) {
2400     uint64_t MaxVF = VF.getKnownMinValue();
2401     if (VF.isScalable()) {
2402       std::optional<unsigned> MaxVScale =
2403           getMaxVScale(*Cost->TheFunction, Cost->TTI);
2404       if (!MaxVScale)
2405         return false;
2406       MaxVF *= *MaxVScale;
2407     }
2408 
2409     return (MaxUIntTripCount - TC).ugt(MaxVF * MaxUF);
2410   }
2411 
2412   return false;
2413 }
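// Worked example (values chosen for illustration): with a widest induction
// type of i32, MaxUIntTripCount is 2^32 - 1. For a known maximum trip count of
// 1000, a fixed VF of 4 and a MaxUF of 8, the check is
// (2^32 - 1 - 1000) ugt 32, which holds, so the overflow check is known false
// and can be removed.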
2414 
2415 // Return whether we allow using masked interleave-groups (for dealing with
2416 // strided loads/stores that reside in predicated blocks, or for dealing
2417 // with gaps).
2418 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2419   // If an override option has been passed in for interleaved accesses, use it.
2420   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2421     return EnableMaskedInterleavedMemAccesses;
2422 
2423   return TTI.enableMaskedInterleavedAccessVectorization();
2424 }
2425 
2426 // Try to vectorize the interleave group that \p Instr belongs to.
2427 //
2428 // E.g. Translate the following interleaved load group (factor = 3):
2429 //   for (i = 0; i < N; i+=3) {
2430 //     R = Pic[i];             // Member of index 0
2431 //     G = Pic[i+1];           // Member of index 1
2432 //     B = Pic[i+2];           // Member of index 2
2433 //     ... // do something to R, G, B
2434 //   }
2435 // To:
2436 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2437 //   %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9>   ; R elements
2438 //   %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10>  ; G elements
2439 //   %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11>  ; B elements
2440 //
2441 // Or translate the following interleaved store group (factor = 3):
2442 //   for (i = 0; i < N; i+=3) {
2443 //     ... do something to R, G, B
2444 //     Pic[i]   = R;           // Member of index 0
2445 //     Pic[i+1] = G;           // Member of index 1
2446 //     Pic[i+2] = B;           // Member of index 2
2447 //   }
2448 // To:
2449 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2450 //   %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
2451 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2452 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2453 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2454 void InnerLoopVectorizer::vectorizeInterleaveGroup(
2455     const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
2456     VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
2457     VPValue *BlockInMask, bool NeedsMaskForGaps) {
2458   Instruction *Instr = Group->getInsertPos();
2459   const DataLayout &DL = Instr->getModule()->getDataLayout();
2460 
2461   // Prepare for the vector type of the interleaved load/store.
2462   Type *ScalarTy = getLoadStoreType(Instr);
2463   unsigned InterleaveFactor = Group->getFactor();
2464   auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2465 
2466   // Prepare for the new pointers.
2467   SmallVector<Value *, 2> AddrParts;
2468   unsigned Index = Group->getIndex(Instr);
2469 
2470   // TODO: extend the masked interleaved-group support to reversed access.
2471   assert((!BlockInMask || !Group->isReverse()) &&
2472          "Reversed masked interleave-group not supported.");
2473 
2474   Value *Idx;
2475   // If the group is reverse, adjust the index to refer to the last vector lane
2476   // instead of the first. We adjust the index from the first vector lane,
2477   // rather than directly getting the pointer for lane VF - 1, because the
2478   // pointer operand of the interleaved access is supposed to be uniform. For
2479   // uniform instructions, we're only required to generate a value for the
2480   // first vector lane in each unroll iteration.
2481   if (Group->isReverse()) {
2482     Value *RuntimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF);
2483     Idx = Builder.CreateSub(RuntimeVF, Builder.getInt32(1));
2484     Idx = Builder.CreateMul(Idx, Builder.getInt32(Group->getFactor()));
2485     Idx = Builder.CreateAdd(Idx, Builder.getInt32(Index));
2486     Idx = Builder.CreateNeg(Idx);
2487   } else
2488     Idx = Builder.getInt32(-Index);
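  // Worked example: for a reversed group with a fixed VF of 4, factor 3 and the
  // current member at index 1, the code above computes
  // Idx = -((4 - 1) * 3 + 1) = -10, so the GEP below steps the pointer back ten
  // elements; for a non-reversed group Idx is simply -Index = -1.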
2489 
2490   for (unsigned Part = 0; Part < UF; Part++) {
2491     Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
2492     if (auto *I = dyn_cast<Instruction>(AddrPart))
2493       State.setDebugLocFrom(I->getDebugLoc());
2494 
2495     // Note that the current instruction could be at any index in the group.
2496     // We need to adjust the address to that of the member at index 0.
2497     //
2498     // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
2499     //       b = A[i];       // Member of index 0
2500     // The current pointer points to A[i+1]; adjust it to A[i].
2501     //
2502     // E.g.  A[i+1] = a;     // Member of index 1
2503     //       A[i]   = b;     // Member of index 0
2504     //       A[i+2] = c;     // Member of index 2 (Current instruction)
2505     // The current pointer points to A[i+2]; adjust it to A[i].
2506 
2507     bool InBounds = false;
2508     if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2509       InBounds = gep->isInBounds();
2510     AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Idx, "", InBounds);
2511     AddrParts.push_back(AddrPart);
2512   }
2513 
2514   State.setDebugLocFrom(Instr->getDebugLoc());
2515   Value *PoisonVec = PoisonValue::get(VecTy);
2516 
2517   auto CreateGroupMask = [this, &BlockInMask, &State, &InterleaveFactor](
2518                              unsigned Part, Value *MaskForGaps) -> Value * {
2519     if (VF.isScalable()) {
2520       assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
2521       assert(InterleaveFactor == 2 &&
2522              "Unsupported deinterleave factor for scalable vectors");
2523       auto *BlockInMaskPart = State.get(BlockInMask, Part);
2524       SmallVector<Value *, 2> Ops = {BlockInMaskPart, BlockInMaskPart};
2525       auto *MaskTy =
2526           VectorType::get(Builder.getInt1Ty(), VF.getKnownMinValue() * 2, true);
2527       return Builder.CreateIntrinsic(
2528           MaskTy, Intrinsic::experimental_vector_interleave2, Ops,
2529           /*FMFSource=*/nullptr, "interleaved.mask");
2530     }
2531 
2532     if (!BlockInMask)
2533       return MaskForGaps;
2534 
2535     Value *BlockInMaskPart = State.get(BlockInMask, Part);
2536     Value *ShuffledMask = Builder.CreateShuffleVector(
2537         BlockInMaskPart,
2538         createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2539         "interleaved.mask");
2540     return MaskForGaps ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2541                                              MaskForGaps)
2542                        : ShuffledMask;
2543   };
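  // Illustrative sketch: for a fixed VF of 4 and interleave factor 3, a block
  // mask <m0, m1, m2, m3> is shuffled with the replicated mask into
  // <m0, m0, m0, m1, m1, m1, m2, m2, m2, m3, m3, m3>, so every lane of the wide
  // access inherits the predicate of its original iteration. For scalable VFs
  // (factor 2 only) the block mask is instead interleaved with itself via
  // llvm.experimental.vector.interleave2.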
2544 
2545   // Vectorize the interleaved load group.
2546   if (isa<LoadInst>(Instr)) {
2547     Value *MaskForGaps = nullptr;
2548     if (NeedsMaskForGaps) {
2549       MaskForGaps =
2550           createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2551       assert(MaskForGaps && "Mask for Gaps is required but it is null");
2552     }
2553 
2554     // For each unroll part, create a wide load for the group.
2555     SmallVector<Value *, 2> NewLoads;
2556     for (unsigned Part = 0; Part < UF; Part++) {
2557       Instruction *NewLoad;
2558       if (BlockInMask || MaskForGaps) {
2559         assert(useMaskedInterleavedAccesses(*TTI) &&
2560                "masked interleaved groups are not allowed.");
2561         Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
2562         NewLoad =
2563             Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(),
2564                                      GroupMask, PoisonVec, "wide.masked.vec");
2565       }
2566       else
2567         NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2568                                             Group->getAlign(), "wide.vec");
2569       Group->addMetadata(NewLoad);
2570       NewLoads.push_back(NewLoad);
2571     }
2572 
2573     if (VecTy->isScalableTy()) {
2574       assert(InterleaveFactor == 2 &&
2575              "Unsupported deinterleave factor for scalable vectors");
2576 
2577       for (unsigned Part = 0; Part < UF; ++Part) {
2578         // Scalable vectors cannot use arbitrary shufflevectors (only splats),
2579         // so we must use intrinsics to deinterleave.
2580         Value *DI = Builder.CreateIntrinsic(
2581             Intrinsic::experimental_vector_deinterleave2, VecTy, NewLoads[Part],
2582             /*FMFSource=*/nullptr, "strided.vec");
2583         unsigned J = 0;
2584         for (unsigned I = 0; I < InterleaveFactor; ++I) {
2585           Instruction *Member = Group->getMember(I);
2586 
2587           if (!Member)
2588             continue;
2589 
2590           Value *StridedVec = Builder.CreateExtractValue(DI, I);
2591           // If this member has a different type, cast the result type.
2592           if (Member->getType() != ScalarTy) {
2593             VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2594             StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2595           }
2596 
2597           if (Group->isReverse())
2598             StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
2599 
2600           State.set(VPDefs[J], StridedVec, Part);
2601           ++J;
2602         }
2603       }
2604 
2605       return;
2606     }
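    // Illustrative sketch: for VF = vscale x 4, factor 2 and i32 elements, the
    // wide load above produces a <vscale x 8 x i32> value and
    // llvm.experimental.vector.deinterleave2 splits it into two
    // <vscale x 4 x i32> vectors holding the even- and odd-indexed elements,
    // which are then picked out with extractvalue for members 0 and 1.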
2607 
2608     // For each member in the group, shuffle out the appropriate data from the
2609     // wide loads.
2610     unsigned J = 0;
2611     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2612       Instruction *Member = Group->getMember(I);
2613 
2614       // Skip the gaps in the group.
2615       if (!Member)
2616         continue;
2617 
2618       auto StrideMask =
2619           createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
2620       for (unsigned Part = 0; Part < UF; Part++) {
2621         Value *StridedVec = Builder.CreateShuffleVector(
2622             NewLoads[Part], StrideMask, "strided.vec");
2623 
2624         // If this member has a different type, cast the result type.
2625         if (Member->getType() != ScalarTy) {
2626           assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2627           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2628           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2629         }
2630 
2631         if (Group->isReverse())
2632           StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
2633 
2634         State.set(VPDefs[J], StridedVec, Part);
2635       }
2636       ++J;
2637     }
2638     return;
2639   }
2640 
2641   // The sub vector type for current instruction.
2642   auto *SubVT = VectorType::get(ScalarTy, VF);
2643 
2644   // Vectorize the interleaved store group.
2645   Value *MaskForGaps =
2646       createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2647   assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) &&
2648          "masked interleaved groups are not allowed.");
2649   assert((!MaskForGaps || !VF.isScalable()) &&
2650          "masking gaps for scalable vectors is not yet supported.");
2651   for (unsigned Part = 0; Part < UF; Part++) {
2652     // Collect the stored vector from each member.
2653     SmallVector<Value *, 4> StoredVecs;
2654     unsigned StoredIdx = 0;
2655     for (unsigned i = 0; i < InterleaveFactor; i++) {
2656       assert((Group->getMember(i) || MaskForGaps) &&
2657              "Fail to get a member from an interleaved store group");
2658       Instruction *Member = Group->getMember(i);
2659 
2660       // Skip the gaps in the group.
2661       if (!Member) {
2662         Value *Undef = PoisonValue::get(SubVT);
2663         StoredVecs.push_back(Undef);
2664         continue;
2665       }
2666 
2667       Value *StoredVec = State.get(StoredValues[StoredIdx], Part);
2668       ++StoredIdx;
2669 
2670       if (Group->isReverse())
2671         StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse");
2672 
2673       // If this member has a different type, cast it to a unified type.
2674 
2675       if (StoredVec->getType() != SubVT)
2676         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2677 
2678       StoredVecs.push_back(StoredVec);
2679     }
2680 
2681     // Interleave all the smaller vectors into one wider vector.
2682     Value *IVec = interleaveVectors(Builder, StoredVecs, "interleaved.vec");
2683     Instruction *NewStoreInstr;
2684     if (BlockInMask || MaskForGaps) {
2685       Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
2686       NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part],
2687                                                 Group->getAlign(), GroupMask);
2688     } else
2689       NewStoreInstr =
2690           Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2691 
2692     Group->addMetadata(NewStoreInstr);
2693   }
2694 }
2695 
2696 void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr,
2697                                                VPReplicateRecipe *RepRecipe,
2698                                                const VPIteration &Instance,
2699                                                VPTransformState &State) {
2700   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2701 
2702   // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
2703   // the first lane and part.
2704   if (isa<NoAliasScopeDeclInst>(Instr))
2705     if (!Instance.isFirstIteration())
2706       return;
2707 
2708   // Does this instruction return a value?
2709   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2710 
2711   Instruction *Cloned = Instr->clone();
2712   if (!IsVoidRetTy) {
2713     Cloned->setName(Instr->getName() + ".cloned");
2714 #if !defined(NDEBUG)
2715     // Verify that VPlan type inference results agree with the type of the
2716     // generated values.
2717     assert(State.TypeAnalysis.inferScalarType(RepRecipe) == Cloned->getType() &&
2718            "inferred type and type from generated instructions do not match");
2719 #endif
2720   }
2721 
2722   RepRecipe->setFlags(Cloned);
2723 
2724   if (auto DL = Instr->getDebugLoc())
2725     State.setDebugLocFrom(DL);
2726 
2727   // Replace the operands of the cloned instructions with their scalar
2728   // equivalents in the new loop.
2729   for (const auto &I : enumerate(RepRecipe->operands())) {
2730     auto InputInstance = Instance;
2731     VPValue *Operand = I.value();
2732     if (vputils::isUniformAfterVectorization(Operand))
2733       InputInstance.Lane = VPLane::getFirstLane();
2734     Cloned->setOperand(I.index(), State.get(Operand, InputInstance));
2735   }
2736   State.addNewMetadata(Cloned, Instr);
2737 
2738   // Place the cloned scalar in the new loop.
2739   State.Builder.Insert(Cloned);
2740 
2741   State.set(RepRecipe, Cloned, Instance);
2742 
2743   // If we just cloned a new assumption, add it to the assumption cache.
2744   if (auto *II = dyn_cast<AssumeInst>(Cloned))
2745     AC->registerAssumption(II);
2746 
2747   // End if-block.
2748   bool IfPredicateInstr = RepRecipe->getParent()->getParent()->isReplicator();
2749   if (IfPredicateInstr)
2750     PredicatedInstructions.push_back(Cloned);
2751 }
2752 
2753 Value *
2754 InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
2755   if (VectorTripCount)
2756     return VectorTripCount;
2757 
2758   Value *TC = getTripCount();
2759   IRBuilder<> Builder(InsertBlock->getTerminator());
2760 
2761   Type *Ty = TC->getType();
2762   // This is where we can make the step a runtime constant.
2763   Value *Step = createStepForVF(Builder, Ty, VF, UF);
2764 
2765   // If the tail is to be folded by masking, round the number of iterations N
2766   // up to a multiple of Step instead of rounding down. This is done by first
2767   // adding Step-1 and then rounding down. Note that it's ok if this addition
2768   // overflows: the vector induction variable will eventually wrap to zero given
2769   // that it starts at zero and its Step is a power of two; the loop will then
2770   // exit, with the last early-exit vector comparison also producing all-true.
2771   // For scalable vectors the VF is not guaranteed to be a power of 2, but this
2772   // is accounted for in emitIterationCountCheck that adds an overflow check.
2773   if (Cost->foldTailByMasking()) {
2774     assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
2775            "VF*UF must be a power of 2 when folding tail by masking");
2776     Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF);
2777     TC = Builder.CreateAdd(
2778         TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up");
2779   }
2780 
2781   // Now we need to generate the expression for the part of the loop that the
2782   // vectorized body will execute. This is equal to N - (N % Step) if scalar
2783   // iterations are not required for correctness, or N - Step, otherwise. Step
2784   // is equal to the vectorization factor (number of SIMD elements) times the
2785   // unroll factor (number of SIMD instructions).
2786   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2787 
2788   // There are cases where we *must* run at least one iteration in the remainder
2789   // loop.  See the cost model for when this can happen.  If the step evenly
2790   // divides the trip count, we set the remainder to be equal to the step. If
2791   // the step does not evenly divide the trip count, no adjustment is necessary
2792   // since there will already be scalar iterations. Note that the minimum
2793   // iterations check ensures that N >= Step.
2794   if (Cost->requiresScalarEpilogue(VF.isVector())) {
2795     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2796     R = Builder.CreateSelect(IsZero, Step, R);
2797   }
2798 
2799   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2800 
2801   return VectorTripCount;
2802 }
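// Worked example (numbers chosen for illustration): with a trip count of 1001,
// VF = 4 and UF = 2 the step is 8, so n.mod.vf = 1 and n.vec = 1000. If a
// scalar epilogue is required and the trip count were 1000 (evenly divisible),
// the remainder is bumped from 0 to 8, giving n.vec = 992 so the epilogue
// still runs. When folding the tail by masking, 1001 is first rounded up and
// n.vec becomes 1008.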
2803 
2804 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2805                                                    const DataLayout &DL) {
2806   // Verify that V is a vector type with the same number of elements as DstVTy.
2807   auto *DstFVTy = cast<VectorType>(DstVTy);
2808   auto VF = DstFVTy->getElementCount();
2809   auto *SrcVecTy = cast<VectorType>(V->getType());
2810   assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match");
2811   Type *SrcElemTy = SrcVecTy->getElementType();
2812   Type *DstElemTy = DstFVTy->getElementType();
2813   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2814          "Vector elements must have same size");
2815 
2816   // Do a direct cast if element types are castable.
2817   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2818     return Builder.CreateBitOrPointerCast(V, DstFVTy);
2819   }
2820   // V cannot be cast directly to the desired vector type. This may happen when
2821   // V is a floating point vector but DstVTy is a vector of pointers, or
2822   // vice-versa. Handle this with a two-step bitcast through an intermediate
2823   // integer type, i.e. Ptr <-> Int <-> Float.
2824   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2825          "Only one type should be a pointer type");
2826   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2827          "Only one type should be a floating point type");
2828   Type *IntTy =
2829       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2830   auto *VecIntTy = VectorType::get(IntTy, VF);
2831   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2832   return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
2833 }
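// Illustrative sketch (value names hypothetical): on a target with 64-bit
// pointers, casting a <4 x double> value to <4 x ptr> cannot be done with a
// single bitcast, so the helper above emits
//   %tmp = bitcast <4 x double> %v to <4 x i64>
//   %res = inttoptr <4 x i64> %tmp to <4 x ptr>
// using the element-sized integer vector as the intermediate type.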
2834 
2835 void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
2836   Value *Count = getTripCount();
2837   // Reuse the existing vector loop preheader for the TC checks.
2838   // Note that a new preheader block is generated for the vector loop.
2839   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2840   IRBuilder<> Builder(TCCheckBlock->getTerminator());
2841 
2842   // Generate code to check if the loop's trip count is less than VF * UF, or
2843   // equal to it in case a scalar epilogue is required; this implies that the
2844   // vector trip count is zero. This check also covers the case where adding one
2845   // to the backedge-taken count overflowed leading to an incorrect trip count
2846   // of zero. In this case we will also jump to the scalar loop.
2847   auto P = Cost->requiresScalarEpilogue(VF.isVector()) ? ICmpInst::ICMP_ULE
2848                                                        : ICmpInst::ICMP_ULT;
2849 
2850   // If tail is to be folded, vector loop takes care of all iterations.
2851   Type *CountTy = Count->getType();
2852   Value *CheckMinIters = Builder.getFalse();
2853   auto CreateStep = [&]() -> Value * {
2854     // Create a step of max(MinProfitableTripCount, UF * VF).
2855     if (UF * VF.getKnownMinValue() >= MinProfitableTripCount.getKnownMinValue())
2856       return createStepForVF(Builder, CountTy, VF, UF);
2857 
2858     Value *MinProfTC =
2859         createStepForVF(Builder, CountTy, MinProfitableTripCount, 1);
2860     if (!VF.isScalable())
2861       return MinProfTC;
2862     return Builder.CreateBinaryIntrinsic(
2863         Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF));
2864   };
2865 
2866   TailFoldingStyle Style = Cost->getTailFoldingStyle();
2867   if (Style == TailFoldingStyle::None)
2868     CheckMinIters =
2869         Builder.CreateICmp(P, Count, CreateStep(), "min.iters.check");
2870   else if (VF.isScalable() &&
2871            !isIndvarOverflowCheckKnownFalse(Cost, VF, UF) &&
2872            Style != TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) {
2873     // vscale is not necessarily a power-of-2, which means we cannot guarantee
2874     // an overflow to zero when updating induction variables and so an
2875     // additional overflow check is required before entering the vector loop.
2876 
2877     // Get the maximum unsigned value for the type.
2878     Value *MaxUIntTripCount =
2879         ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask());
2880     Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count);
2881 
2882     // Don't execute the vector loop if (UMax - n) < (VF * UF).
2883     CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep());
2884   }
2885 
2886   // Create new preheader for vector loop.
2887   LoopVectorPreHeader =
2888       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
2889                  "vector.ph");
2890 
2891   assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
2892                                DT->getNode(Bypass)->getIDom()) &&
2893          "TC check is expected to dominate Bypass");
2894 
2895   // Update dominator for Bypass & LoopExit (if needed).
2896   DT->changeImmediateDominator(Bypass, TCCheckBlock);
2897   if (!Cost->requiresScalarEpilogue(VF.isVector()))
2898     // If there is an epilogue which must run, there's no edge from the
2899     // middle block to exit blocks and thus no need to update the immediate
2900     // dominator of the exit blocks.
2901     DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
2902 
2903   BranchInst &BI =
2904       *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
2905   if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
2906     setBranchWeights(BI, MinItersBypassWeights);
2907   ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
2908   LoopBypassBlocks.push_back(TCCheckBlock);
2909 }
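// Illustrative sketch (value names hypothetical): for a fixed VF = 4, UF = 2,
// no tail folding and a minimum profitable trip count of at most VF * UF, the
// check emitted above is roughly
//   %min.iters.check = icmp ult i64 %count, 8
//   br i1 %min.iters.check, label %scalar.ph, label %vector.ph
// (icmp ule when a scalar epilogue is required), bypassing the vector loop
// whenever fewer than one full vector iteration is available.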
2910 
2911 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {
2912   BasicBlock *const SCEVCheckBlock =
2913       RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock);
2914   if (!SCEVCheckBlock)
2915     return nullptr;
2916 
2917   assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
2918            (OptForSizeBasedOnProfile &&
2919             Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
2920          "Cannot SCEV check stride or overflow when optimizing for size");
2921 
2922 
2923   // Update dominator only if this is first RT check.
2924   if (LoopBypassBlocks.empty()) {
2925     DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
2926     if (!Cost->requiresScalarEpilogue(VF.isVector()))
2927       // If there is an epilogue which must run, there's no edge from the
2928       // middle block to exit blocks and thus no need to update the immediate
2929       // dominator of the exit blocks.
2930       DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
2931   }
2932 
2933   LoopBypassBlocks.push_back(SCEVCheckBlock);
2934   AddedSafetyChecks = true;
2935   return SCEVCheckBlock;
2936 }
2937 
2938 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) {
2939   // VPlan-native path does not do any analysis for runtime checks currently.
2940   if (EnableVPlanNativePath)
2941     return nullptr;
2942 
2943   BasicBlock *const MemCheckBlock =
2944       RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader);
2945 
2946   // Check if we generated code that checks at runtime whether arrays overlap.
2947   // We put the checks into a separate block to make the more common case of
2948   // few elements faster.
2949   if (!MemCheckBlock)
2950     return nullptr;
2951 
2952   if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
2953     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
2954            "Cannot emit memory checks when optimizing for size, unless forced "
2955            "to vectorize.");
2956     ORE->emit([&]() {
2957       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
2958                                         OrigLoop->getStartLoc(),
2959                                         OrigLoop->getHeader())
2960              << "Code-size may be reduced by not forcing "
2961                 "vectorization, or by source-code modifications "
2962                 "eliminating the need for runtime checks "
2963                 "(e.g., adding 'restrict').";
2964     });
2965   }
2966 
2967   LoopBypassBlocks.push_back(MemCheckBlock);
2968 
2969   AddedSafetyChecks = true;
2970 
2971   return MemCheckBlock;
2972 }
2973 
2974 void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
2975   LoopScalarBody = OrigLoop->getHeader();
2976   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
2977   assert(LoopVectorPreHeader && "Invalid loop structure");
2978   LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
2979   assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF.isVector())) &&
2980          "multiple exit loop without required epilogue?");
2981 
2982   LoopMiddleBlock =
2983       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
2984                  LI, nullptr, Twine(Prefix) + "middle.block");
2985   LoopScalarPreHeader =
2986       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
2987                  nullptr, Twine(Prefix) + "scalar.ph");
2988 
2989   auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
2990 
2991   // Set up the middle block terminator.  Two cases:
2992   // 1) If we know that we must execute the scalar epilogue, emit an
2993   //    unconditional branch.
2994   // 2) Otherwise, we must have a single unique exit block (due to how we
2995   //    implement the multiple exit case).  In this case, set up a conditional
2996   //    branch from the middle block to the loop scalar preheader, and the
2997   //    exit block.  completeLoopSkeleton will update the condition to use an
2998   //    iteration check, if required to decide whether to execute the remainder.
2999   BranchInst *BrInst =
3000       Cost->requiresScalarEpilogue(VF.isVector())
3001           ? BranchInst::Create(LoopScalarPreHeader)
3002           : BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
3003                                Builder.getTrue());
3004   BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3005   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3006 
3007   // Update dominator for loop exit. During skeleton creation, only the vector
3008   // pre-header and the middle block are created. The vector loop is entirely
3009   // created during VPlan execution.
3010   if (!Cost->requiresScalarEpilogue(VF.isVector()))
3011     // If there is an epilogue which must run, there's no edge from the
3012     // middle block to exit blocks and thus no need to update the immediate
3013     // dominator of the exit blocks.
3014     DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3015 }
3016 
3017 PHINode *InnerLoopVectorizer::createInductionResumeValue(
3018     PHINode *OrigPhi, const InductionDescriptor &II, Value *Step,
3019     ArrayRef<BasicBlock *> BypassBlocks,
3020     std::pair<BasicBlock *, Value *> AdditionalBypass) {
3021   Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
3022   assert(VectorTripCount && "Expected valid arguments");
3023 
3024   Instruction *OldInduction = Legal->getPrimaryInduction();
3025   Value *&EndValue = IVEndValues[OrigPhi];
3026   Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
3027   if (OrigPhi == OldInduction) {
3028     // We know what the end value is.
3029     EndValue = VectorTripCount;
3030   } else {
3031     IRBuilder<> B(LoopVectorPreHeader->getTerminator());
3032 
3033     // Fast-math-flags propagate from the original induction instruction.
3034     if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3035       B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3036 
3037     EndValue = emitTransformedIndex(B, VectorTripCount, II.getStartValue(),
3038                                     Step, II.getKind(), II.getInductionBinOp());
3039     EndValue->setName("ind.end");
3040 
3041     // Compute the end value for the additional bypass (if applicable).
3042     if (AdditionalBypass.first) {
3043       B.SetInsertPoint(AdditionalBypass.first,
3044                        AdditionalBypass.first->getFirstInsertionPt());
3045       EndValueFromAdditionalBypass =
3046           emitTransformedIndex(B, AdditionalBypass.second, II.getStartValue(),
3047                                Step, II.getKind(), II.getInductionBinOp());
3048       EndValueFromAdditionalBypass->setName("ind.end");
3049     }
3050   }
3051 
3052   // Create phi nodes to merge from the backedge-taken check block.
3053   PHINode *BCResumeVal = PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3054                                          LoopScalarPreHeader->getTerminator());
3055   // Copy original phi DL over to the new one.
3056   BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3057 
3058   // The new PHI merges the original incoming value, in case of a bypass,
3059   // or the value at the end of the vectorized loop.
3060   BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3061 
3062   // Fix the scalar body counter (PHI node).
3063   // The old induction's phi node in the scalar body needs the truncated
3064   // value.
3065   for (BasicBlock *BB : BypassBlocks)
3066     BCResumeVal->addIncoming(II.getStartValue(), BB);
3067 
3068   if (AdditionalBypass.first)
3069     BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
3070                                           EndValueFromAdditionalBypass);
3071   return BCResumeVal;
3072 }
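// Illustrative sketch (block names hypothetical): for the primary induction
// starting at 0, the resume phi produced above looks roughly like
//   %bc.resume.val = phi i64 [ %n.vec, %middle.block ], [ 0, %bypass ]
// so the scalar loop resumes after the iterations handled by the vector loop,
// or at the original start value when a bypass check fails.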
3073 
3074 /// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
3075 /// expansion results.
3076 static Value *getExpandedStep(const InductionDescriptor &ID,
3077                               const SCEV2ValueTy &ExpandedSCEVs) {
3078   const SCEV *Step = ID.getStep();
3079   if (auto *C = dyn_cast<SCEVConstant>(Step))
3080     return C->getValue();
3081   if (auto *U = dyn_cast<SCEVUnknown>(Step))
3082     return U->getValue();
3083   auto I = ExpandedSCEVs.find(Step);
3084   assert(I != ExpandedSCEVs.end() && "SCEV must be expanded at this point");
3085   return I->second;
3086 }
3087 
3088 void InnerLoopVectorizer::createInductionResumeValues(
3089     const SCEV2ValueTy &ExpandedSCEVs,
3090     std::pair<BasicBlock *, Value *> AdditionalBypass) {
3091   assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3092           (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3093          "Inconsistent information about additional bypass.");
3094   // We are going to resume the execution of the scalar loop.
3095   // Go over all of the induction variables that we found and fix the
3096   // PHIs that are left in the scalar version of the loop.
3097   // The starting values of PHI nodes depend on the counter of the last
3098   // iteration in the vectorized loop.
3099   // If we come from a bypass edge then we need to start from the original
3100   // start value.
3101   for (const auto &InductionEntry : Legal->getInductionVars()) {
3102     PHINode *OrigPhi = InductionEntry.first;
3103     const InductionDescriptor &II = InductionEntry.second;
3104     PHINode *BCResumeVal = createInductionResumeValue(
3105         OrigPhi, II, getExpandedStep(II, ExpandedSCEVs), LoopBypassBlocks,
3106         AdditionalBypass);
3107     OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3108   }
3109 }
3110 
3111 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() {
3112   // The trip counts should be cached by now.
3113   Value *Count = getTripCount();
3114   Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
3115 
3116   auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3117 
3118   // Add a check in the middle block to see if we have completed
3119   // all of the iterations in the first vector loop.  Three cases:
3120   // 1) If we require a scalar epilogue, there is no conditional branch as
3121   //    we unconditionally branch to the scalar preheader.  Do nothing.
3122   // 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
3123   //    Thus if tail is to be folded, we know we don't need to run the
3124   //    remainder and we can use the previous value for the condition (true).
3125   // 3) Otherwise, construct a runtime check.
3126   if (!Cost->requiresScalarEpilogue(VF.isVector()) &&
3127       !Cost->foldTailByMasking()) {
3128     // Here we use the same DebugLoc as the scalar loop latch terminator instead
3129     // of the corresponding compare because they may have ended up with
3130     // different line numbers and we want to avoid awkward line stepping while
3131     // debugging. E.g. if the compare has a line number inside the loop.
3132     // TODO: At the moment, CreateICmpEQ will simplify conditions with constant
3133     // operands. Perform simplification directly on VPlan once the branch is
3134     // modeled there.
3135     IRBuilder<> B(LoopMiddleBlock->getTerminator());
3136     B.SetCurrentDebugLocation(ScalarLatchTerm->getDebugLoc());
3137     Value *CmpN = B.CreateICmpEQ(Count, VectorTripCount, "cmp.n");
3138     BranchInst &BI = *cast<BranchInst>(LoopMiddleBlock->getTerminator());
3139     BI.setCondition(CmpN);
3140     if (hasBranchWeightMD(*ScalarLatchTerm)) {
3141       // Assume that `Count % VectorTripCount` is equally distributed.
3142       unsigned TripCount = UF * VF.getKnownMinValue();
3143       assert(TripCount > 0 && "trip count should not be zero");
3144       const uint32_t Weights[] = {1, TripCount - 1};
3145       setBranchWeights(BI, Weights);
3146     }
3147   }
3148 
3149 #ifdef EXPENSIVE_CHECKS
3150   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3151 #endif
3152 
3153   return LoopVectorPreHeader;
3154 }
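// Illustrative sketch (block and value names hypothetical): for VF = 4 and
// UF = 2 the middle-block branch becomes roughly
//   %cmp.n = icmp eq i64 %count, %n.vec
//   br i1 %cmp.n, label %exit, label %scalar.ph
// with branch weights {1, 7}, i.e. the remainder loop is assumed to be needed
// seven times out of eight, treating %count urem 8 as uniformly distributed.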
3155 
3156 std::pair<BasicBlock *, Value *>
3157 InnerLoopVectorizer::createVectorizedLoopSkeleton(
3158     const SCEV2ValueTy &ExpandedSCEVs) {
3159   /*
3160    In this function we generate a new loop. The new loop will contain
3161    the vectorized instructions while the old loop will continue to run the
3162    scalar remainder.
3163 
3164        [ ] <-- old preheader - loop iteration number check and SCEVs in Plan's
3165      /  |      preheader are expanded here. Eventually all required SCEV
3166     /   |      expansion should happen here.
3167    /    v
3168   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
3169   |  /  |
3170   | /   v
3171   ||   [ ]     <-- vector pre header.
3172   |/    |
3173   |     v
3174   |    [  ] \
3175   |    [  ]_|   <-- vector loop (created during VPlan execution).
3176   |     |
3177   |     v
3178   \   -[ ]   <--- middle-block.
3179    \/   |
3180    /\   v
3181    | ->[ ]     <--- new preheader.
3182    |    |
3183  (opt)  v      <-- edge from middle to exit iff epilogue is not required.
3184    |   [ ] \
3185    |   [ ]_|   <-- old scalar loop to handle remainder (scalar epilogue).
3186     \   |
3187      \  v
3188       >[ ]     <-- exit block(s).
3189    ...
3190    */
3191 
3192   // Create an empty vector loop, and prepare basic blocks for the runtime
3193   // checks.
3194   createVectorLoopSkeleton("");
3195 
3196   // Now, compare the new count to zero. If it is zero skip the vector loop and
3197   // jump to the scalar loop. This check also covers the case where the
3198   // backedge-taken count is uint##_max: adding one to it will overflow leading
3199   // to an incorrect trip count of zero. In this (rare) case we will also jump
3200   // to the scalar loop.
3201   emitIterationCountCheck(LoopScalarPreHeader);
3202 
3203   // Generate the code to check any assumptions that we've made for SCEV
3204   // expressions.
3205   emitSCEVChecks(LoopScalarPreHeader);
3206 
3207   // Generate the code that checks at runtime whether arrays overlap. We put
3208   // the checks into a separate block to make the more common case of few
3209   // elements faster.
3210   emitMemRuntimeChecks(LoopScalarPreHeader);
3211 
3212   // Emit phis for the new starting index of the scalar loop.
3213   createInductionResumeValues(ExpandedSCEVs);
3214 
3215   return {completeLoopSkeleton(), nullptr};
3216 }
3217 
3218 // Fix up external users of the induction variable. At this point, we are
3219 // in LCSSA form, with all external PHIs that use the IV having one input value,
3220 // coming from the remainder loop. We need those PHIs to also have a correct
3221 // value for the IV when arriving directly from the middle block.
3222 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3223                                        const InductionDescriptor &II,
3224                                        Value *VectorTripCount, Value *EndValue,
3225                                        BasicBlock *MiddleBlock,
3226                                        BasicBlock *VectorHeader, VPlan &Plan,
3227                                        VPTransformState &State) {
3228   // There are two kinds of external IV usages - those that use the value
3229   // computed in the last iteration (the PHI) and those that use the penultimate
3230   // value (the value that feeds into the phi from the loop latch).
3231   // We allow both, but they, obviously, have different values.
3232 
3233   assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
3234 
3235   DenseMap<Value *, Value *> MissingVals;
3236 
3237   // An external user of the last iteration's value should see the value that
3238   // the remainder loop uses to initialize its own IV.
3239   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3240   for (User *U : PostInc->users()) {
3241     Instruction *UI = cast<Instruction>(U);
3242     if (!OrigLoop->contains(UI)) {
3243       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3244       MissingVals[UI] = EndValue;
3245     }
3246   }
3247 
3248   // An external user of the penultimate value needs to see EndValue - Step.
3249   // The simplest way to get this is to recompute it from the constituent SCEVs,
3250   // that is Start + (Step * (CRD - 1)).
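  // Worked example: for an induction with Start = 0, Step = 2 and a vector trip
  // count of 128, the escaping value computed below is 0 + 2 * (128 - 1) = 254,
  // i.e. the value the phi held in the last vector iteration, one step behind
  // the 256 seen by users of the post-increment value.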
3251   for (User *U : OrigPhi->users()) {
3252     auto *UI = cast<Instruction>(U);
3253     if (!OrigLoop->contains(UI)) {
3254       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3255       IRBuilder<> B(MiddleBlock->getTerminator());
3256 
3257       // Fast-math-flags propagate from the original induction instruction.
3258       if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3259         B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3260 
3261       Value *CountMinusOne = B.CreateSub(
3262           VectorTripCount, ConstantInt::get(VectorTripCount->getType(), 1));
3263       CountMinusOne->setName("cmo");
3264 
3265       VPValue *StepVPV = Plan.getSCEVExpansion(II.getStep());
3266       assert(StepVPV && "step must have been expanded during VPlan execution");
3267       Value *Step = StepVPV->isLiveIn() ? StepVPV->getLiveInIRValue()
3268                                         : State.get(StepVPV, {0, 0});
3269       Value *Escape =
3270           emitTransformedIndex(B, CountMinusOne, II.getStartValue(), Step,
3271                                II.getKind(), II.getInductionBinOp());
3272       Escape->setName("ind.escape");
3273       MissingVals[UI] = Escape;
3274     }
3275   }
3276 
3277   for (auto &I : MissingVals) {
3278     PHINode *PHI = cast<PHINode>(I.first);
3279     // One corner case we have to handle is two IVs "chasing" each other,
3280     // that is %IV2 = phi [...], [ %IV1, %latch ]
3281     // In this case, if IV1 has an external use, we need to avoid adding both
3282     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3283     // don't already have an incoming value for the middle block.
3284     if (PHI->getBasicBlockIndex(MiddleBlock) == -1) {
3285       PHI->addIncoming(I.second, MiddleBlock);
3286       Plan.removeLiveOut(PHI);
3287     }
3288   }
3289 }
3290 
3291 namespace {
3292 
3293 struct CSEDenseMapInfo {
3294   static bool canHandle(const Instruction *I) {
3295     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3296            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3297   }
3298 
3299   static inline Instruction *getEmptyKey() {
3300     return DenseMapInfo<Instruction *>::getEmptyKey();
3301   }
3302 
3303   static inline Instruction *getTombstoneKey() {
3304     return DenseMapInfo<Instruction *>::getTombstoneKey();
3305   }
3306 
3307   static unsigned getHashValue(const Instruction *I) {
3308     assert(canHandle(I) && "Unknown instruction!");
3309     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3310                                                            I->value_op_end()));
3311   }
3312 
3313   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3314     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3315         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3316       return LHS == RHS;
3317     return LHS->isIdenticalTo(RHS);
3318   }
3319 };
3320 
3321 } // end anonymous namespace
3322 
3323 /// Perform CSE of induction variable instructions.
3324 static void cse(BasicBlock *BB) {
3325   // Perform simple CSE.
3326   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3327   for (Instruction &In : llvm::make_early_inc_range(*BB)) {
3328     if (!CSEDenseMapInfo::canHandle(&In))
3329       continue;
3330 
3331     // Check if we can replace this instruction with any of the
3332     // visited instructions.
3333     if (Instruction *V = CSEMap.lookup(&In)) {
3334       In.replaceAllUsesWith(V);
3335       In.eraseFromParent();
3336       continue;
3337     }
3338 
3339     CSEMap[&In] = &In;
3340   }
3341 }
3342 
3343 InstructionCost
3344 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
3345                                               ElementCount VF) const {
3346   // We only need to calculate a cost if the VF is scalar; for actual vectors
3347   // we should already have a pre-calculated cost at each VF.
3348   if (!VF.isScalar())
3349     return CallWideningDecisions.at(std::make_pair(CI, VF)).Cost;
3350 
3351   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
3352   Type *RetTy = CI->getType();
3353   if (RecurrenceDescriptor::isFMulAddIntrinsic(CI))
3354     if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind))
3355       return *RedCost;
3356 
3357   SmallVector<Type *, 4> Tys;
3358   for (auto &ArgOp : CI->args())
3359     Tys.push_back(ArgOp->getType());
3360 
3361   InstructionCost ScalarCallCost =
3362       TTI.getCallInstrCost(CI->getCalledFunction(), RetTy, Tys, CostKind);
3363 
3364   // If this is an intrinsic we may have a lower cost for it.
3365   if (getVectorIntrinsicIDForCall(CI, TLI)) {
3366     InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
3367     return std::min(ScalarCallCost, IntrinsicCost);
3368   }
3369   return ScalarCallCost;
3370 }
3371 
3372 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) {
3373   if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
3374     return Elt;
3375   return VectorType::get(Elt, VF);
3376 }
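// Illustrative sketch: MaybeVectorizeType(i32, VF = 4) yields <4 x i32>, while
// a scalar VF or a non-integer/pointer/FP element type (e.g. void) is returned
// unchanged.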
3377 
3378 InstructionCost
3379 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3380                                                    ElementCount VF) const {
3381   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3382   assert(ID && "Expected intrinsic call!");
3383   Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
3384   FastMathFlags FMF;
3385   if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3386     FMF = FPMO->getFastMathFlags();
3387 
3388   SmallVector<const Value *> Arguments(CI->args());
3389   FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
3390   SmallVector<Type *> ParamTys;
3391   std::transform(FTy->param_begin(), FTy->param_end(),
3392                  std::back_inserter(ParamTys),
3393                  [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });
3394 
3395   IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
3396                                     dyn_cast<IntrinsicInst>(CI));
3397   return TTI.getIntrinsicInstrCost(CostAttrs,
3398                                    TargetTransformInfo::TCK_RecipThroughput);
3399 }
3400 
3401 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3402   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3403   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3404   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3405 }
3406 
3407 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3408   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3409   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3410   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3411 }
3412 
3413 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
3414                                             VPlan &Plan) {
3415   // Fix widened non-induction PHIs by setting up the PHI operands.
3416   if (EnableVPlanNativePath)
3417     fixNonInductionPHIs(Plan, State);
3418 
3419   // At this point every instruction in the original loop is widened to a
3420   // vector form. Now we need to fix the recurrences in the loop. These PHI
3421   // nodes are currently empty because we did not want to introduce cycles.
3422   // This is the second stage of vectorizing recurrences. Note that fixing
3423   // reduction phis are already modeled in VPlan.
3424   // TODO: Also model fixing fixed-order recurrence phis in VPlan.
3425   VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion();
3426   VPBasicBlock *HeaderVPBB = VectorRegion->getEntryBasicBlock();
3427   for (VPRecipeBase &R : HeaderVPBB->phis()) {
3428     if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
3429       fixFixedOrderRecurrence(FOR, State);
3430   }
3431 
3432   // Forget the original basic block.
3433   PSE.getSE()->forgetLoop(OrigLoop);
3434   PSE.getSE()->forgetBlockAndLoopDispositions();
3435 
3436   // After vectorization, the exit blocks of the original loop will have
3437   // additional predecessors. Invalidate SCEVs for the exit phis in case SE
3438   // looked through single-entry phis.
3439   SmallVector<BasicBlock *> ExitBlocks;
3440   OrigLoop->getExitBlocks(ExitBlocks);
3441   for (BasicBlock *Exit : ExitBlocks)
3442     for (PHINode &PN : Exit->phis())
3443       PSE.getSE()->forgetLcssaPhiWithNewPredecessor(OrigLoop, &PN);
3444 
3445   VPBasicBlock *LatchVPBB = VectorRegion->getExitingBasicBlock();
3446   Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]);
3447   if (Cost->requiresScalarEpilogue(VF.isVector())) {
3448     // No edge from the middle block to the unique exit block has been inserted
3449     // and there is nothing to fix from vector loop; phis should have incoming
3450     // from scalar loop only.
3451   } else {
3452     // TODO: Check VPLiveOuts to see if IV users need fixing instead of checking
3453     // the cost model.
3454 
3455     // If we inserted an edge from the middle block to the unique exit block,
3456     // update uses outside the loop (phis) to account for the newly inserted
3457     // edge.
3458 
3459     // Fix-up external users of the induction variables.
3460     for (const auto &Entry : Legal->getInductionVars())
3461       fixupIVUsers(Entry.first, Entry.second,
3462                    getOrCreateVectorTripCount(VectorLoop->getLoopPreheader()),
3463                    IVEndValues[Entry.first], LoopMiddleBlock,
3464                    VectorLoop->getHeader(), Plan, State);
3465   }
3466 
3467   // Fix LCSSA phis not already fixed earlier. Extracts may need to be generated
3468   // in the exit block, so update the builder.
3469   State.Builder.SetInsertPoint(State.CFG.ExitBB,
3470                                State.CFG.ExitBB->getFirstNonPHIIt());
3471   for (const auto &KV : Plan.getLiveOuts())
3472     KV.second->fixPhi(Plan, State);
3473 
3474   for (Instruction *PI : PredicatedInstructions)
3475     sinkScalarOperands(&*PI);
3476 
3477   // Remove redundant induction instructions.
3478   cse(VectorLoop->getHeader());
3479 
3480   // Set/update profile weights for the vector and remainder loops as original
3481   // loop iterations are now distributed among them. Note that the original loop
3482   // represented by LoopScalarBody becomes the remainder loop after vectorization.
3483   //
3484   // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
3485   // end up getting a slightly roughened result but that should be OK since the
3486   // profile is not inherently precise anyway. Note also that a possible bypass
3487   // of vector code caused by legality checks is ignored, assigning all the weight
3488   // to the vector loop, optimistically.
3489   //
3490   // For scalable vectorization we can't know at compile time how many
3491   // iterations of the loop are handled in one vector iteration, so instead
3492   // assume a pessimistic vscale of '1'.
3493   setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody), VectorLoop,
3494                                LI->getLoopFor(LoopScalarBody),
3495                                VF.getKnownMinValue() * UF);
3496 }
3497 
3498 void InnerLoopVectorizer::fixFixedOrderRecurrence(
3499     VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) {
3500   // This is the second phase of vectorizing first-order recurrences. An
3501   // overview of the transformation is described below. Suppose we have the
3502   // following loop.
3503   //
3504   //   for (int i = 0; i < n; ++i)
3505   //     b[i] = a[i] - a[i - 1];
3506   //
3507   // There is a first-order recurrence on "a". For this loop, the shorthand
3508   // scalar IR looks like:
3509   //
3510   //   scalar.ph:
3511   //     s_init = a[-1]
3512   //     br scalar.body
3513   //
3514   //   scalar.body:
3515   //     i = phi [0, scalar.ph], [i+1, scalar.body]
3516   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3517   //     s2 = a[i]
3518   //     b[i] = s2 - s1
3519   //     br cond, scalar.body, ...
3520   //
3521   // In this example, s1 is a recurrence because its value depends on the
3522   // previous iteration. In the first phase of vectorization, we created a
3523   // vector phi v1 for s1. We now complete the vectorization and produce the
3524   // shorthand vector IR shown below (for VF = 4, UF = 1).
3525   //
3526   //   vector.ph:
3527   //     v_init = vector(..., ..., ..., a[-1])
3528   //     br vector.body
3529   //
3530   //   vector.body
3531   //     i = phi [0, vector.ph], [i+4, vector.body]
3532   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
3533   //     v2 = a[i, i+1, i+2, i+3];
3534   //     v3 = vector(v1(3), v2(0, 1, 2))
3535   //     b[i, i+1, i+2, i+3] = v2 - v3
3536   //     br cond, vector.body, middle.block
3537   //
3538   //   middle.block:
3539   //     x = v2(3)
3540   //     br scalar.ph
3541   //
3542   //   scalar.ph:
3543   //     s_init = phi [x, middle.block], [a[-1], otherwise]
3544   //     br scalar.body
3545   //
3546   // After execution completes the vector loop, we extract the next value of
3547   // the recurrence (x) to use as the initial value in the scalar loop.
3548 
3549   // Extract the last vector element in the middle block. This will be the
3550   // initial value for the recurrence when jumping to the scalar loop.
3551   VPValue *PreviousDef = PhiR->getBackedgeValue();
3552   Value *Incoming = State.get(PreviousDef, UF - 1);
3553   auto *ExtractForScalar = Incoming;
3554   auto *IdxTy = Builder.getInt32Ty();
3555   Value *RuntimeVF = nullptr;
3556   if (VF.isVector()) {
3557     auto *One = ConstantInt::get(IdxTy, 1);
3558     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3559     RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
3560     auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
3561     ExtractForScalar =
3562         Builder.CreateExtractElement(Incoming, LastIdx, "vector.recur.extract");
3563   }
3564 
3565   auto RecurSplice = cast<VPInstruction>(*PhiR->user_begin());
3566   assert(PhiR->getNumUsers() == 1 &&
3567          RecurSplice->getOpcode() ==
3568              VPInstruction::FirstOrderRecurrenceSplice &&
3569          "recurrence phi must have a single user: FirstOrderRecurrenceSplice");
3570   SmallVector<VPLiveOut *> LiveOuts;
3571   for (VPUser *U : RecurSplice->users())
3572     if (auto *LiveOut = dyn_cast<VPLiveOut>(U))
3573       LiveOuts.push_back(LiveOut);
3574 
3575   if (!LiveOuts.empty()) {
3576     // Extract the second-to-last element in the middle block if the
3577     // Phi is used outside the loop. We need to extract the phi itself
3578     // and not the last element (the phi update in the current iteration). This
3579     // will be the value when jumping to the exit block from the
3580     // LoopMiddleBlock, when the scalar loop is not run at all.
3581     Value *ExtractForPhiUsedOutsideLoop = nullptr;
3582     if (VF.isVector()) {
3583       auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
3584       ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3585           Incoming, Idx, "vector.recur.extract.for.phi");
3586     } else {
3587       assert(UF > 1 && "VF and UF cannot both be 1");
3588       // When loop is unrolled without vectorizing, initialize
3589       // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled
3590       // value of `Incoming`. This is analogous to the vectorized case above:
3591       // extracting the second last element when VF > 1.
3592       ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
3593     }
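    // Illustrative sketch (in the notation of the comment at the top of this
    // function): for VF = 4 and UF = 1 the extract above yields v2(2), the
    // second-to-last lane of the final loaded vector. That lane equals the last
    // lane of the spliced value v3, i.e. the value the recurrence phi held in
    // the final vector iteration.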
3594 
3595     for (VPLiveOut *LiveOut : LiveOuts) {
3596       assert(!Cost->requiresScalarEpilogue(VF.isVector()));
3597       PHINode *LCSSAPhi = LiveOut->getPhi();
3598       LCSSAPhi->addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3599       State.Plan->removeLiveOut(LCSSAPhi);
3600     }
3601   }
3602 
3603   // Fix the initial value of the original recurrence in the scalar loop.
3604   Builder.SetInsertPoint(LoopScalarPreHeader, LoopScalarPreHeader->begin());
3605   PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
3606   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3607   auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue();
3608   for (auto *BB : predecessors(LoopScalarPreHeader)) {
3609     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3610     Start->addIncoming(Incoming, BB);
3611   }
3612 
3613   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3614   Phi->setName("scalar.recur");
3615 }
3616 
3617 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
3618   // The basic block and loop containing the predicated instruction.
3619   auto *PredBB = PredInst->getParent();
3620   auto *VectorLoop = LI->getLoopFor(PredBB);
3621 
3622   // Initialize a worklist with the operands of the predicated instruction.
3623   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3624 
3625   // Holds instructions that we need to analyze again. An instruction may be
3626   // reanalyzed if we don't yet know if we can sink it or not.
3627   SmallVector<Instruction *, 8> InstsToReanalyze;
3628 
3629   // Returns true if a given use occurs in the predicated block. Phi nodes use
3630   // their operands in their corresponding predecessor blocks.
3631   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
3632     auto *I = cast<Instruction>(U.getUser());
3633     BasicBlock *BB = I->getParent();
3634     if (auto *Phi = dyn_cast<PHINode>(I))
3635       BB = Phi->getIncomingBlock(
3636           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3637     return BB == PredBB;
3638   };
3639 
3640   // Iteratively sink the scalarized operands of the predicated instruction
3641   // into the block we created for it. When an instruction is sunk, its
3642   // operands are then added to the worklist. The algorithm ends once a full
3643   // pass through the worklist fails to sink a single instruction.
3644   bool Changed;
3645   do {
3646     // Add the instructions that need to be reanalyzed to the worklist, and
3647     // reset the changed indicator.
3648     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
3649     InstsToReanalyze.clear();
3650     Changed = false;
3651 
3652     while (!Worklist.empty()) {
3653       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
3654 
3655       // We can't sink an instruction if it is a phi node, is not in the loop,
3656       // may have side effects or may read from memory.
3657       // TODO: Could do more granular checking to allow sinking a load past non-store instructions.
3658       if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
3659           I->mayHaveSideEffects() || I->mayReadFromMemory())
3660         continue;
3661 
3662       // If the instruction is already in PredBB, check if we can sink its
3663       // operands. In that case, VPlan's sinkScalarOperands() succeeded in
3664       // sinking the scalar instruction I, hence it appears in PredBB; but it
3665       // may have failed to sink I's operands (recursively), which we try
3666       // (again) here.
3667       if (I->getParent() == PredBB) {
3668         Worklist.insert(I->op_begin(), I->op_end());
3669         continue;
3670       }
3671 
3672       // It's legal to sink the instruction if all its uses occur in the
3673       // predicated block. Otherwise, there's nothing to do yet, and we may
3674       // need to reanalyze the instruction.
3675       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
3676         InstsToReanalyze.push_back(I);
3677         continue;
3678       }
3679 
3680       // Move the instruction to the beginning of the predicated block, and add
3681       // its operands to the worklist.
3682       I->moveBefore(&*PredBB->getFirstInsertionPt());
3683       Worklist.insert(I->op_begin(), I->op_end());
3684 
3685       // The sinking may have enabled other instructions to be sunk, so we will
3686       // need to iterate.
3687       Changed = true;
3688     }
3689   } while (Changed);
3690 }
3691 
3692 void InnerLoopVectorizer::fixNonInductionPHIs(VPlan &Plan,
3693                                               VPTransformState &State) {
3694   auto Iter = vp_depth_first_deep(Plan.getEntry());
3695   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
3696     for (VPRecipeBase &P : VPBB->phis()) {
3697       VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P);
3698       if (!VPPhi)
3699         continue;
3700       PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
3701       // Make sure the builder has a valid insert point.
3702       Builder.SetInsertPoint(NewPhi);
3703       for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
3704         VPValue *Inc = VPPhi->getIncomingValue(i);
3705         VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
3706         NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
3707       }
3708     }
3709   }
3710 }
3711 
3712 bool InnerLoopVectorizer::useOrderedReductions(
3713     const RecurrenceDescriptor &RdxDesc) {
3714   return Cost->useOrderedReductions(RdxDesc);
3715 }
3716 
3717 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
3718   // We should not collect Scalars more than once per VF. Right now, this
3719   // function is called from collectUniformsAndScalars(), which already does
3720   // this check. Collecting Scalars for VF=1 does not make any sense.
3721   assert(VF.isVector() && !Scalars.contains(VF) &&
3722          "This function should not be visited twice for the same VF");
3723 
3724   // This avoids any chances of creating a REPLICATE recipe during planning
3725   // since that would result in generation of scalarized code during execution,
3726   // which is not supported for scalable vectors.
3727   if (VF.isScalable()) {
3728     Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end());
3729     return;
3730   }
3731 
3732   SmallSetVector<Instruction *, 8> Worklist;
3733 
3734   // These sets are used to seed the analysis with pointers used by memory
3735   // accesses that will remain scalar.
3736   SmallSetVector<Instruction *, 8> ScalarPtrs;
3737   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
3738   auto *Latch = TheLoop->getLoopLatch();
3739 
3740   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
3741   // The pointer operands of loads and stores will be scalar as long as the
3742   // memory access is not a gather or scatter operation. The value operand of a
3743   // store will remain scalar if the store is scalarized.
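  // Illustrative example (hypothetical IR): given `store i32 %val, ptr %gep`,
  // isScalarUse(Store, %gep) is true unless the store was decided to be a
  // scatter (CM_GatherScatter), whereas isScalarUse(Store, %val) is true only
  // if the store itself will be scalarized (CM_Scalarize).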
3744   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
3745     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
3746     assert(WideningDecision != CM_Unknown &&
3747            "Widening decision should be ready at this moment");
3748     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
3749       if (Ptr == Store->getValueOperand())
3750         return WideningDecision == CM_Scalarize;
3751     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
3752            "Ptr is neither a value or pointer operand");
3753     return WideningDecision != CM_GatherScatter;
3754   };
3755 
3756   // A helper that returns true if the given value is a bitcast or
3757   // getelementptr instruction contained in the loop.
3758   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
3759     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
3760             isa<GetElementPtrInst>(V)) &&
3761            !TheLoop->isLoopInvariant(V);
3762   };
3763 
3764   // A helper that evaluates a memory access's use of a pointer. If the use will
3765   // be a scalar use and the pointer is only used by memory accesses, we place
3766   // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
3767   // PossibleNonScalarPtrs.
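  // For example (hypothetical): a GEP whose only users are loads and stores
  // that use it as a scalar pointer ends up in ScalarPtrs, while a GEP that
  // is additionally used by, say, a ptrtoint lands in PossibleNonScalarPtrs.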
3768   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
3769     // We only care about bitcast and getelementptr instructions contained in
3770     // the loop.
3771     if (!isLoopVaryingBitCastOrGEP(Ptr))
3772       return;
3773 
3774     // If the pointer has already been identified as scalar (e.g., if it was
3775     // also identified as uniform), there's nothing to do.
3776     auto *I = cast<Instruction>(Ptr);
3777     if (Worklist.count(I))
3778       return;
3779 
3780     // If the use of the pointer will be a scalar use, and all users of the
3781     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
3782     // place the pointer in PossibleNonScalarPtrs.
3783     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
3784           return isa<LoadInst>(U) || isa<StoreInst>(U);
3785         }))
3786       ScalarPtrs.insert(I);
3787     else
3788       PossibleNonScalarPtrs.insert(I);
3789   };
3790 
3791   // We seed the scalars analysis with two classes of instructions: (1)
3792   // instructions marked uniform-after-vectorization and (2) bitcast,
3793   // getelementptr and (pointer) phi instructions used by memory accesses
3794   // requiring a scalar use.
3795   //
3796   // (1) Add to the worklist all instructions that have been identified as
3797   // uniform-after-vectorization.
3798   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
3799 
3800   // (2) Add to the worklist all bitcast and getelementptr instructions used by
3801   // memory accesses requiring a scalar use. The pointer operands of loads and
3802   // stores will be scalar as long as the memory accesses is not a gather or
3803   // scatter operation. The value operand of a store will remain scalar if the
3804   // store is scalarized.
3805   for (auto *BB : TheLoop->blocks())
3806     for (auto &I : *BB) {
3807       if (auto *Load = dyn_cast<LoadInst>(&I)) {
3808         evaluatePtrUse(Load, Load->getPointerOperand());
3809       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
3810         evaluatePtrUse(Store, Store->getPointerOperand());
3811         evaluatePtrUse(Store, Store->getValueOperand());
3812       }
3813     }
3814   for (auto *I : ScalarPtrs)
3815     if (!PossibleNonScalarPtrs.count(I)) {
3816       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
3817       Worklist.insert(I);
3818     }
3819 
3820   // Insert the forced scalars.
3821   // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector
3822   // induction variable when the PHI user is scalarized.
3823   auto ForcedScalar = ForcedScalars.find(VF);
3824   if (ForcedScalar != ForcedScalars.end())
3825     for (auto *I : ForcedScalar->second) {
3826       LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n");
3827       Worklist.insert(I);
3828     }
3829 
3830   // Expand the worklist by looking through any bitcasts and getelementptr
3831   // instructions we've already identified as scalar. This is similar to the
3832   // expansion step in collectLoopUniforms(); however, here we're only
3833   // expanding to include additional bitcasts and getelementptr instructions.
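  // Illustrative example (hypothetical IR): suppose %gep2 = getelementptr
  // ..., %gep1 is already in the worklist and %gep1's only user is %gep2.
  // Then %gep1 is pulled into the worklist too, since all of its users are
  // either already-known scalars or scalar memory accesses.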
3834   unsigned Idx = 0;
3835   while (Idx != Worklist.size()) {
3836     Instruction *Dst = Worklist[Idx++];
3837     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
3838       continue;
3839     auto *Src = cast<Instruction>(Dst->getOperand(0));
3840     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
3841           auto *J = cast<Instruction>(U);
3842           return !TheLoop->contains(J) || Worklist.count(J) ||
3843                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
3844                   isScalarUse(J, Src));
3845         })) {
3846       Worklist.insert(Src);
3847       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
3848     }
3849   }
3850 
3851   // An induction variable will remain scalar if all users of the induction
3852   // variable and induction variable update remain scalar.
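  // Illustrative example (hypothetical source): in
  //   for (int i = 0; i < n; ++i) a[i] = b[i];
  // the induction `i` and its update feed only address computations that stay
  // scalar, so both remain in the worklist; if `i` were also stored as part
  // of a widened value, it would not.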
3853   for (const auto &Induction : Legal->getInductionVars()) {
3854     auto *Ind = Induction.first;
3855     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
3856 
3857     // If tail-folding is applied, the primary induction variable will be used
3858     // to feed a vector compare.
3859     if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
3860       continue;
3861 
3862     // Returns true if \p Indvar is a pointer induction that is used directly by
3863     // load/store instruction \p I.
3864     auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
3865                                               Instruction *I) {
3866       return Induction.second.getKind() ==
3867                  InductionDescriptor::IK_PtrInduction &&
3868              (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
3869              Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar);
3870     };
3871 
3872     // Determine if all users of the induction variable are scalar after
3873     // vectorization.
3874     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
3875       auto *I = cast<Instruction>(U);
3876       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
3877              IsDirectLoadStoreFromPtrIndvar(Ind, I);
3878     });
3879     if (!ScalarInd)
3880       continue;
3881 
3882     // Determine if all users of the induction variable update instruction are
3883     // scalar after vectorization.
3884     auto ScalarIndUpdate =
3885         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
3886           auto *I = cast<Instruction>(U);
3887           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
3888                  IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
3889         });
3890     if (!ScalarIndUpdate)
3891       continue;
3892 
3893     // The induction variable and its update instruction will remain scalar.
3894     Worklist.insert(Ind);
3895     Worklist.insert(IndUpdate);
3896     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
3897     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
3898                       << "\n");
3899   }
3900 
3901   Scalars[VF].insert(Worklist.begin(), Worklist.end());
3902 }
3903 
3904 bool LoopVectorizationCostModel::isScalarWithPredication(
3905     Instruction *I, ElementCount VF) const {
3906   if (!isPredicatedInst(I))
3907     return false;
3908 
3909   // Do we have a non-scalar lowering for this predicated
3910   // instruction? No - it is scalar with predication.
3911   switch(I->getOpcode()) {
3912   default:
3913     return true;
3914   case Instruction::Call:
3915     if (VF.isScalar())
3916       return true;
3917     return CallWideningDecisions.at(std::make_pair(cast<CallInst>(I), VF))
3918                .Kind == CM_Scalarize;
3919   case Instruction::Load:
3920   case Instruction::Store: {
3921     auto *Ptr = getLoadStorePointerOperand(I);
3922     auto *Ty = getLoadStoreType(I);
3923     Type *VTy = Ty;
3924     if (VF.isVector())
3925       VTy = VectorType::get(Ty, VF);
3926     const Align Alignment = getLoadStoreAlignment(I);
3927     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
3928                                 TTI.isLegalMaskedGather(VTy, Alignment))
3929                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
3930                                 TTI.isLegalMaskedScatter(VTy, Alignment));
3931   }
3932   case Instruction::UDiv:
3933   case Instruction::SDiv:
3934   case Instruction::SRem:
3935   case Instruction::URem: {
3936     // We have the option to use the safe-divisor idiom to avoid predication.
3937     // The cost based decision here will always select safe-divisor for
3938     // scalable vectors as scalarization isn't legal.
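    // Illustrative sketch of the safe-divisor idiom (value names invented):
    // a predicated `udiv %x, %d` can instead be emitted as
    //   %d.safe = select <VF x i1> %mask, %d, splat(1)
    //   %q      = udiv %x, %d.safe
    // so the divide runs unconditionally on every lane without trapping.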
3939     const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
3940     return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost);
3941   }
3942   }
3943 }
3944 
3945 bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
3946   if (!blockNeedsPredicationForAnyReason(I->getParent()))
3947     return false;
3948 
3949   // Can we prove this instruction is safe to unconditionally execute?
3950   // If not, we must use some form of predication.
3951   switch(I->getOpcode()) {
3952   default:
3953     return false;
3954   case Instruction::Load:
3955   case Instruction::Store: {
3956     if (!Legal->isMaskRequired(I))
3957       return false;
3958     // When we know the load's address is loop invariant and the instruction
3959     // in the original scalar loop was unconditionally executed then we
3960     // don't need to mark it as a predicated instruction. Tail folding may
3961     // introduce additional predication, but we're guaranteed to always have
3962     // at least one active lane.  We call Legal->blockNeedsPredication here
3963     // because it doesn't query tail-folding.  For stores, we need to prove
3964     // not only speculation safety (which follows from the same argument as
3965     // loads), but also that the value being stored is correct.  The easiest
3966     // form of the latter is to require that all values stored are the same.
3967     if (Legal->isInvariant(getLoadStorePointerOperand(I)) &&
3968         (isa<LoadInst>(I) ||
3969          (isa<StoreInst>(I) &&
3970           TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()))) &&
3971         !Legal->blockNeedsPredication(I->getParent()))
3972       return false;
3973     return true;
3974   }
3975   case Instruction::UDiv:
3976   case Instruction::SDiv:
3977   case Instruction::SRem:
3978   case Instruction::URem:
3979     // TODO: We can use the loop-preheader as context point here and get
3980     // context sensitive reasoning
3981     return !isSafeToSpeculativelyExecute(I);
3982   case Instruction::Call:
3983     return Legal->isMaskRequired(I);
3984   }
3985 }
3986 
3987 std::pair<InstructionCost, InstructionCost>
3988 LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
3989                                                     ElementCount VF) const {
3990   assert(I->getOpcode() == Instruction::UDiv ||
3991          I->getOpcode() == Instruction::SDiv ||
3992          I->getOpcode() == Instruction::SRem ||
3993          I->getOpcode() == Instruction::URem);
3994   assert(!isSafeToSpeculativelyExecute(I));
3995 
3996   const TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
3997 
3998   // Scalarization isn't legal for scalable vector types
3999   InstructionCost ScalarizationCost = InstructionCost::getInvalid();
4000   if (!VF.isScalable()) {
4001     // Get the scalarization cost and scale this amount by the probability of
4002     // executing the predicated block. If the instruction is not predicated,
4003     // we fall through to the next case.
4004     ScalarizationCost = 0;
4005 
4006     // These instructions have a non-void type, so account for the phi nodes
4007     // that we will create. This cost is likely to be zero. The phi node
4008     // cost, if any, should be scaled by the block probability because it
4009     // models a copy at the end of each predicated block.
4010     ScalarizationCost += VF.getKnownMinValue() *
4011       TTI.getCFInstrCost(Instruction::PHI, CostKind);
4012 
4013     // The cost of the non-predicated instruction.
4014     ScalarizationCost += VF.getKnownMinValue() *
4015       TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind);
4016 
4017     // The cost of insertelement and extractelement instructions needed for
4018     // scalarization.
4019     ScalarizationCost += getScalarizationOverhead(I, VF, CostKind);
4020 
4021     // Scale the cost by the probability of executing the predicated blocks.
4022     // This assumes the predicated block for each vector lane is equally
4023     // likely.
4024     ScalarizationCost = ScalarizationCost / getReciprocalPredBlockProb();
4025   }
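  // Worked example with assumed costs (purely illustrative): for VF=4, a PHI
  // cost of 0, a scalar divide cost of 20 and a scalarization overhead of 8,
  // ScalarizationCost = (4*0 + 4*20 + 8) / getReciprocalPredBlockProb().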
4026   InstructionCost SafeDivisorCost = 0;
4027 
4028   auto *VecTy = ToVectorTy(I->getType(), VF);
4029 
4030   // The cost of the select guard to ensure all lanes are well defined
4031   // after we speculate above any internal control flow.
4032   SafeDivisorCost += TTI.getCmpSelInstrCost(
4033     Instruction::Select, VecTy,
4034     ToVectorTy(Type::getInt1Ty(I->getContext()), VF),
4035     CmpInst::BAD_ICMP_PREDICATE, CostKind);
4036 
4037   // Certain instructions can be cheaper to vectorize if they have a constant
4038   // second vector operand. One example of this is shifts on x86.
4039   Value *Op2 = I->getOperand(1);
4040   auto Op2Info = TTI.getOperandInfo(Op2);
4041   if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
4042       Legal->isInvariant(Op2))
4043     Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
4044 
4045   SmallVector<const Value *, 4> Operands(I->operand_values());
4046   SafeDivisorCost += TTI.getArithmeticInstrCost(
4047     I->getOpcode(), VecTy, CostKind,
4048     {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
4049     Op2Info, Operands, I);
4050   return {ScalarizationCost, SafeDivisorCost};
4051 }
4052 
4053 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
4054     Instruction *I, ElementCount VF) {
4055   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4056   assert(getWideningDecision(I, VF) == CM_Unknown &&
4057          "Decision should not be set yet.");
4058   auto *Group = getInterleavedAccessGroup(I);
4059   assert(Group && "Must have a group.");
4060 
4061   // If the instruction's allocated size doesn't equal its type size, it
4062   // requires padding and will be scalarized.
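  // For example (illustrative, depends on the data layout): an i1 or i48
  // access typically has an alloc size (8 resp. 64 bits) larger than its type
  // size (1 resp. 48 bits), so such accesses are treated as irregular.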
4063   auto &DL = I->getModule()->getDataLayout();
4064   auto *ScalarTy = getLoadStoreType(I);
4065   if (hasIrregularType(ScalarTy, DL))
4066     return false;
4067 
4068   // If the group involves a non-integral pointer, we may not be able to
4069   // losslessly cast all values to a common type.
4070   unsigned InterleaveFactor = Group->getFactor();
4071   bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy);
4072   for (unsigned i = 0; i < InterleaveFactor; i++) {
4073     Instruction *Member = Group->getMember(i);
4074     if (!Member)
4075       continue;
4076     auto *MemberTy = getLoadStoreType(Member);
4077     bool MemberNI = DL.isNonIntegralPointerType(MemberTy);
4078     // Don't coerce non-integral pointers to integers or vice versa.
4079     if (MemberNI != ScalarNI) {
4080       // TODO: Consider adding special nullptr value case here
4081       return false;
4082     } else if (MemberNI && ScalarNI &&
4083                ScalarTy->getPointerAddressSpace() !=
4084                MemberTy->getPointerAddressSpace()) {
4085       return false;
4086     }
4087   }
4088 
4089   // Check if masking is required.
4090   // A Group may need masking for one of two reasons: it resides in a block that
4091   // needs predication, or it was decided to use masking to deal with gaps
4092   // (either a gap at the end of a load-access that may result in a speculative
4093   // load, or any gaps in a store-access).
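  // Illustrative example (hypothetical source): for
  //   for (i = 0; i < n; i++) { x += A[3*i]; y += A[3*i+1]; }
  // the load group has factor 3 with a gap at member 2, so the last wide load
  // may read past the end of A; it needs either a scalar epilogue or, when
  // that is not allowed, a mask covering the trailing gap.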
4094   bool PredicatedAccessRequiresMasking =
4095       blockNeedsPredicationForAnyReason(I->getParent()) &&
4096       Legal->isMaskRequired(I);
4097   bool LoadAccessWithGapsRequiresEpilogMasking =
4098       isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
4099       !isScalarEpilogueAllowed();
4100   bool StoreAccessWithGapsRequiresMasking =
4101       isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
4102   if (!PredicatedAccessRequiresMasking &&
4103       !LoadAccessWithGapsRequiresEpilogMasking &&
4104       !StoreAccessWithGapsRequiresMasking)
4105     return true;
4106 
4107   // If masked interleaving is required, we expect that the user/target had
4108   // enabled it, because otherwise it either wouldn't have been created or
4109   // it should have been invalidated by the CostModel.
4110   assert(useMaskedInterleavedAccesses(TTI) &&
4111          "Masked interleave-groups for predicated accesses are not enabled.");
4112 
4113   if (Group->isReverse())
4114     return false;
4115 
4116   auto *Ty = getLoadStoreType(I);
4117   const Align Alignment = getLoadStoreAlignment(I);
4118   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4119                           : TTI.isLegalMaskedStore(Ty, Alignment);
4120 }
4121 
4122 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
4123     Instruction *I, ElementCount VF) {
4124   // Get and ensure we have a valid memory instruction.
4125   assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
4126 
4127   auto *Ptr = getLoadStorePointerOperand(I);
4128   auto *ScalarTy = getLoadStoreType(I);
4129 
4130   // In order to be widened, the pointer should be consecutive, first of all.
4131   if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
4132     return false;
4133 
4134   // If the instruction is a store located in a predicated block, it will be
4135   // scalarized.
4136   if (isScalarWithPredication(I, VF))
4137     return false;
4138 
4139   // If the instruction's allocated size doesn't equal its type size, it
4140   // requires padding and will be scalarized.
4141   auto &DL = I->getModule()->getDataLayout();
4142   if (hasIrregularType(ScalarTy, DL))
4143     return false;
4144 
4145   return true;
4146 }
4147 
4148 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
4149   // We should not collect Uniforms more than once per VF. Right now,
4150   // this function is called from collectUniformsAndScalars(), which
4151   // already does this check. Collecting Uniforms for VF=1 does not make any
4152   // sense.
4153 
4154   assert(VF.isVector() && !Uniforms.contains(VF) &&
4155          "This function should not be visited twice for the same VF");
4156 
4157   // Initialize the entry for this VF. Even if we find no uniform values, we
4158   // will not analyze this VF again; Uniforms.count(VF) will return 1.
4159   Uniforms[VF].clear();
4160 
4161   // We now know that the loop is vectorizable!
4162   // Collect instructions inside the loop that will remain uniform after
4163   // vectorization.
4164 
4165   // Global values, params and instructions outside of the current loop are
4166   // out of scope.
4167   auto isOutOfScope = [&](Value *V) -> bool {
4168     Instruction *I = dyn_cast<Instruction>(V);
4169     return (!I || !TheLoop->contains(I));
4170   };
4171 
4172   // Worklist containing uniform instructions demanding lane 0.
4173   SetVector<Instruction *> Worklist;
4174   BasicBlock *Latch = TheLoop->getLoopLatch();
4175 
4176   // Add uniform instructions demanding lane 0 to the worklist. Instructions
4177   // that are scalar with predication must not be considered uniform after
4178   // vectorization, because that would create an erroneous replicating region
4179   // where only a single instance out of VF should be formed.
4180   // TODO: optimize such seldom cases if found important, see PR40816.
4181   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
4182     if (isOutOfScope(I)) {
4183       LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
4184                         << *I << "\n");
4185       return;
4186     }
4187     if (isScalarWithPredication(I, VF)) {
4188       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
4189                         << *I << "\n");
4190       return;
4191     }
4192     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
4193     Worklist.insert(I);
4194   };
4195 
4196   // Start with the conditional branch. If the branch condition is an
4197   // instruction contained in the loop that is only used by the branch, it is
4198   // uniform.
4199   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4200   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
4201     addToWorklistIfAllowed(Cmp);
4202 
4203   auto PrevVF = VF.divideCoefficientBy(2);
4204   // Return true if all lanes perform the same memory operation, and we can
4205   // thus choose to execute only one.
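  // Illustrative examples (hypothetical IR): a load from a loop-invariant
  // address such as `%v = load i32, ptr %p`, or a store of a loop-invariant
  // value to a loop-invariant address, performs the same operation on every
  // lane, so executing a single scalar copy suffices.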
4206   auto isUniformMemOpUse = [&](Instruction *I) {
4207     // If the value was already known to not be uniform for the previous
4208     // (smaller VF), it cannot be uniform for the larger VF.
4209     if (PrevVF.isVector()) {
4210       auto Iter = Uniforms.find(PrevVF);
4211       if (Iter != Uniforms.end() && !Iter->second.contains(I))
4212         return false;
4213     }
4214     if (!Legal->isUniformMemOp(*I, VF))
4215       return false;
4216     if (isa<LoadInst>(I))
4217       // Loading the same address always produces the same result, at least
4218       // under the aliasing and ordering constraints already checked.
4219       return true;
4220     // Storing the same value on every iteration.
4221     return TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand());
4222   };
4223 
4224   auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
4225     InstWidening WideningDecision = getWideningDecision(I, VF);
4226     assert(WideningDecision != CM_Unknown &&
4227            "Widening decision should be ready at this moment");
4228 
4229     if (isUniformMemOpUse(I))
4230       return true;
4231 
4232     return (WideningDecision == CM_Widen ||
4233             WideningDecision == CM_Widen_Reverse ||
4234             WideningDecision == CM_Interleave);
4235   };
4236 
4237   // Returns true if Ptr is the pointer operand of a memory access instruction
4238   // I, I is known to not require scalarization, and the pointer is not also
4239   // stored.
4240   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4241     if (isa<StoreInst>(I) && I->getOperand(0) == Ptr)
4242       return false;
4243     return getLoadStorePointerOperand(I) == Ptr &&
4244            (isUniformDecision(I, VF) || Legal->isInvariant(Ptr));
4245   };
4246 
4247   // Holds a list of values which are known to have at least one uniform use.
4248   // Note that there may be other uses which aren't uniform.  A "uniform use"
4249   // here is something which only demands lane 0 of the unrolled iterations;
4250   // it does not imply that all lanes produce the same value (e.g. this is not
4251   // the usual meaning of uniform).
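  // Illustrative example (hypothetical IR): the address of a consecutive
  // widened load, e.g. `%gep = getelementptr i32, ptr %base, i64 %iv`, has a
  // uniform use: only lane 0 of %gep is needed to form the wide pointer, even
  // though %gep itself yields a different address for every lane.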
4252   SetVector<Value *> HasUniformUse;
4253 
4254   // Scan the loop for instructions which are either a) known to have only
4255   // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
4256   for (auto *BB : TheLoop->blocks())
4257     for (auto &I : *BB) {
4258       if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
4259         switch (II->getIntrinsicID()) {
4260         case Intrinsic::sideeffect:
4261         case Intrinsic::experimental_noalias_scope_decl:
4262         case Intrinsic::assume:
4263         case Intrinsic::lifetime_start:
4264         case Intrinsic::lifetime_end:
4265           if (TheLoop->hasLoopInvariantOperands(&I))
4266             addToWorklistIfAllowed(&I);
4267           break;
4268         default:
4269           break;
4270         }
4271       }
4272 
4273       // ExtractValue instructions must be uniform, because the operands are
4274       // known to be loop-invariant.
4275       if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
4276         assert(isOutOfScope(EVI->getAggregateOperand()) &&
4277                "Expected aggregate value to be loop invariant");
4278         addToWorklistIfAllowed(EVI);
4279         continue;
4280       }
4281 
4282       // If there's no pointer operand, there's nothing to do.
4283       auto *Ptr = getLoadStorePointerOperand(&I);
4284       if (!Ptr)
4285         continue;
4286 
4287       if (isUniformMemOpUse(&I))
4288         addToWorklistIfAllowed(&I);
4289 
4290       if (isVectorizedMemAccessUse(&I, Ptr))
4291         HasUniformUse.insert(Ptr);
4292     }
4293 
4294   // Add to the worklist any operands which have *only* uniform (e.g. lane 0
4295   // demanding) users.  Since loops are assumed to be in LCSSA form, this
4296   // disallows uses outside the loop as well.
4297   for (auto *V : HasUniformUse) {
4298     if (isOutOfScope(V))
4299       continue;
4300     auto *I = cast<Instruction>(V);
4301     auto UsersAreMemAccesses =
4302       llvm::all_of(I->users(), [&](User *U) -> bool {
4303         return isVectorizedMemAccessUse(cast<Instruction>(U), V);
4304       });
4305     if (UsersAreMemAccesses)
4306       addToWorklistIfAllowed(I);
4307   }
4308 
4309   // Expand Worklist in topological order: whenever a new instruction
4310   // is added, its users should already be inside the Worklist. This ensures
4311   // that a uniform instruction will only be used by uniform instructions.
4312   unsigned idx = 0;
4313   while (idx != Worklist.size()) {
4314     Instruction *I = Worklist[idx++];
4315 
4316     for (auto *OV : I->operand_values()) {
4317       // isOutOfScope operands cannot be uniform instructions.
4318       if (isOutOfScope(OV))
4319         continue;
4320       // First order recurrence Phi's should typically be considered
4321       // non-uniform.
4322       auto *OP = dyn_cast<PHINode>(OV);
4323       if (OP && Legal->isFixedOrderRecurrence(OP))
4324         continue;
4325       // If all the users of the operand are uniform, then add the
4326       // operand into the uniform worklist.
4327       auto *OI = cast<Instruction>(OV);
4328       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4329             auto *J = cast<Instruction>(U);
4330             return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
4331           }))
4332         addToWorklistIfAllowed(OI);
4333     }
4334   }
4335 
4336   // For an instruction to be added into Worklist above, all its users inside
4337   // the loop should also be in Worklist. However, this condition cannot be
4338   // true for phi nodes that form a cyclic dependence. We must process phi
4339   // nodes separately. An induction variable will remain uniform if all users
4340   // of the induction variable and induction variable update remain uniform.
4341   // The code below handles both pointer and non-pointer induction variables.
4342   for (const auto &Induction : Legal->getInductionVars()) {
4343     auto *Ind = Induction.first;
4344     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4345 
4346     // Determine if all users of the induction variable are uniform after
4347     // vectorization.
4348     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4349       auto *I = cast<Instruction>(U);
4350       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4351              isVectorizedMemAccessUse(I, Ind);
4352     });
4353     if (!UniformInd)
4354       continue;
4355 
4356     // Determine if all users of the induction variable update instruction are
4357     // uniform after vectorization.
4358     auto UniformIndUpdate =
4359         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4360           auto *I = cast<Instruction>(U);
4361           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4362                  isVectorizedMemAccessUse(I, IndUpdate);
4363         });
4364     if (!UniformIndUpdate)
4365       continue;
4366 
4367     // The induction variable and its update instruction will remain uniform.
4368     addToWorklistIfAllowed(Ind);
4369     addToWorklistIfAllowed(IndUpdate);
4370   }
4371 
4372   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4373 }
4374 
4375 bool LoopVectorizationCostModel::runtimeChecksRequired() {
4376   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
4377 
4378   if (Legal->getRuntimePointerChecking()->Need) {
4379     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
4380         "runtime pointer checks needed. Enable vectorization of this "
4381         "loop with '#pragma clang loop vectorize(enable)' when "
4382         "compiling with -Os/-Oz",
4383         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4384     return true;
4385   }
4386 
4387   if (!PSE.getPredicate().isAlwaysTrue()) {
4388     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
4389         "runtime SCEV checks needed. Enable vectorization of this "
4390         "loop with '#pragma clang loop vectorize(enable)' when "
4391         "compiling with -Os/-Oz",
4392         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4393     return true;
4394   }
4395 
4396   // FIXME: Avoid specializing for stride==1 instead of bailing out.
4397   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4398     reportVectorizationFailure("Runtime stride check for small trip count",
4399         "runtime stride == 1 checks needed. Enable vectorization of "
4400         "this loop without such check by compiling with -Os/-Oz",
4401         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4402     return true;
4403   }
4404 
4405   return false;
4406 }
4407 
4408 ElementCount
4409 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
4410   if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
4411     return ElementCount::getScalable(0);
4412 
4413   if (Hints->isScalableVectorizationDisabled()) {
4414     reportVectorizationInfo("Scalable vectorization is explicitly disabled",
4415                             "ScalableVectorizationDisabled", ORE, TheLoop);
4416     return ElementCount::getScalable(0);
4417   }
4418 
4419   LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
4420 
4421   auto MaxScalableVF = ElementCount::getScalable(
4422       std::numeric_limits<ElementCount::ScalarTy>::max());
4423 
4424   // Test that the loop-vectorizer can legalize all operations for this MaxVF.
4425   // FIXME: While for scalable vectors this is currently sufficient, this should
4426   // be replaced by a more detailed mechanism that filters out specific VFs,
4427   // instead of invalidating vectorization for a whole set of VFs based on the
4428   // MaxVF.
4429 
4430   // Disable scalable vectorization if the loop contains unsupported reductions.
4431   if (!canVectorizeReductions(MaxScalableVF)) {
4432     reportVectorizationInfo(
4433         "Scalable vectorization not supported for the reduction "
4434         "operations found in this loop.",
4435         "ScalableVFUnfeasible", ORE, TheLoop);
4436     return ElementCount::getScalable(0);
4437   }
4438 
4439   // Disable scalable vectorization if the loop contains any instructions
4440   // with element types not supported for scalable vectors.
4441   if (any_of(ElementTypesInLoop, [&](Type *Ty) {
4442         return !Ty->isVoidTy() &&
4443                !this->TTI.isElementTypeLegalForScalableVector(Ty);
4444       })) {
4445     reportVectorizationInfo("Scalable vectorization is not supported "
4446                             "for all element types found in this loop.",
4447                             "ScalableVFUnfeasible", ORE, TheLoop);
4448     return ElementCount::getScalable(0);
4449   }
4450 
4451   if (Legal->isSafeForAnyVectorWidth())
4452     return MaxScalableVF;
4453 
4454   // Limit MaxScalableVF by the maximum safe dependence distance.
4455   if (std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI))
4456     MaxScalableVF = ElementCount::getScalable(MaxSafeElements / *MaxVScale);
4457   else
4458     MaxScalableVF = ElementCount::getScalable(0);
4459 
4460   if (!MaxScalableVF)
4461     reportVectorizationInfo(
4462         "Max legal vector width too small, scalable vectorization "
4463         "unfeasible.",
4464         "ScalableVFUnfeasible", ORE, TheLoop);
4465 
4466   return MaxScalableVF;
4467 }
4468 
4469 FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
4470     unsigned MaxTripCount, ElementCount UserVF, bool FoldTailByMasking) {
4471   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
4472   unsigned SmallestType, WidestType;
4473   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
4474 
4475   // Get the maximum safe dependence distance in bits computed by LAA.
4476   // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
4477   // the memory accesses that is most restrictive (involved in the smallest
4478   // dependence distance).
4479   unsigned MaxSafeElements =
4480       llvm::bit_floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
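  // Worked example (illustrative numbers): with a maximum safe vector width
  // of 256 bits and WidestType = 32 bits, MaxSafeElements =
  // bit_floor(256 / 32) = 8 elements.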
4481 
4482   auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
4483   auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
4484 
4485   LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
4486                     << ".\n");
4487   LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
4488                     << ".\n");
4489 
4490   // First analyze the UserVF, fall back if the UserVF should be ignored.
4491   if (UserVF) {
4492     auto MaxSafeUserVF =
4493         UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
4494 
4495     if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
4496       // If `VF=vscale x N` is safe, then so is `VF=N`
4497       if (UserVF.isScalable())
4498         return FixedScalableVFPair(
4499             ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
4500       else
4501         return UserVF;
4502     }
4503 
4504     assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
4505 
4506     // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
4507     // is better to ignore the hint and let the compiler choose a suitable VF.
4508     if (!UserVF.isScalable()) {
4509       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4510                         << " is unsafe, clamping to max safe VF="
4511                         << MaxSafeFixedVF << ".\n");
4512       ORE->emit([&]() {
4513         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4514                                           TheLoop->getStartLoc(),
4515                                           TheLoop->getHeader())
4516                << "User-specified vectorization factor "
4517                << ore::NV("UserVectorizationFactor", UserVF)
4518                << " is unsafe, clamping to maximum safe vectorization factor "
4519                << ore::NV("VectorizationFactor", MaxSafeFixedVF);
4520       });
4521       return MaxSafeFixedVF;
4522     }
4523 
4524     if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
4525       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4526                         << " is ignored because scalable vectors are not "
4527                            "available.\n");
4528       ORE->emit([&]() {
4529         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4530                                           TheLoop->getStartLoc(),
4531                                           TheLoop->getHeader())
4532                << "User-specified vectorization factor "
4533                << ore::NV("UserVectorizationFactor", UserVF)
4534                << " is ignored because the target does not support scalable "
4535                   "vectors. The compiler will pick a more suitable value.";
4536       });
4537     } else {
4538       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4539                         << " is unsafe. Ignoring scalable UserVF.\n");
4540       ORE->emit([&]() {
4541         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4542                                           TheLoop->getStartLoc(),
4543                                           TheLoop->getHeader())
4544                << "User-specified vectorization factor "
4545                << ore::NV("UserVectorizationFactor", UserVF)
4546                << " is unsafe. Ignoring the hint to let the compiler pick a "
4547                   "more suitable value.";
4548       });
4549     }
4550   }
4551 
4552   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
4553                     << " / " << WidestType << " bits.\n");
4554 
4555   FixedScalableVFPair Result(ElementCount::getFixed(1),
4556                              ElementCount::getScalable(0));
4557   if (auto MaxVF =
4558           getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
4559                                   MaxSafeFixedVF, FoldTailByMasking))
4560     Result.FixedVF = MaxVF;
4561 
4562   if (auto MaxVF =
4563           getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
4564                                   MaxSafeScalableVF, FoldTailByMasking))
4565     if (MaxVF.isScalable()) {
4566       Result.ScalableVF = MaxVF;
4567       LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
4568                         << "\n");
4569     }
4570 
4571   return Result;
4572 }
4573 
4574 FixedScalableVFPair
4575 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
4576   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
4577     // TODO: It may be useful to do this, since the check is still likely to be
4578     // dynamically uniform if the target can skip it.
4579     reportVectorizationFailure(
4580         "Not inserting runtime ptr check for divergent target",
4581         "runtime pointer checks needed. Not enabled for divergent target",
4582         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
4583     return FixedScalableVFPair::getNone();
4584   }
4585 
4586   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4587   unsigned MaxTC = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);
4588   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4589   if (TC == 1) {
4590     reportVectorizationFailure("Single iteration (non) loop",
4591         "loop trip count is one, irrelevant for vectorization",
4592         "SingleIterationLoop", ORE, TheLoop);
4593     return FixedScalableVFPair::getNone();
4594   }
4595 
4596   switch (ScalarEpilogueStatus) {
4597   case CM_ScalarEpilogueAllowed:
4598     return computeFeasibleMaxVF(MaxTC, UserVF, false);
4599   case CM_ScalarEpilogueNotAllowedUsePredicate:
4600     [[fallthrough]];
4601   case CM_ScalarEpilogueNotNeededUsePredicate:
4602     LLVM_DEBUG(
4603         dbgs() << "LV: vector predicate hint/switch found.\n"
4604                << "LV: Not allowing scalar epilogue, creating predicated "
4605                << "vector loop.\n");
4606     break;
4607   case CM_ScalarEpilogueNotAllowedLowTripLoop:
4608     // fallthrough as a special case of OptForSize
4609   case CM_ScalarEpilogueNotAllowedOptSize:
4610     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
4611       LLVM_DEBUG(
4612           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
4613     else
4614       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
4615                         << "count.\n");
4616 
4617     // Bail if runtime checks are required, which are not good when optimising
4618     // for size.
4619     if (runtimeChecksRequired())
4620       return FixedScalableVFPair::getNone();
4621 
4622     break;
4623   }
4624 
4625   // The only loops we can vectorize without a scalar epilogue, are loops with
4626   // a bottom-test and a single exiting block. We'd have to handle the fact
4627   // that not every instruction executes on the last iteration.  This will
4628   // require a lane mask which varies through the vector loop body.  (TODO)
4629   if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
4630     // If there was a tail-folding hint/switch, but we can't fold the tail by
4631     // masking, fallback to a vectorization with a scalar epilogue.
4632     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
4633       LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
4634                            "scalar epilogue instead.\n");
4635       ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4636       return computeFeasibleMaxVF(MaxTC, UserVF, false);
4637     }
4638     return FixedScalableVFPair::getNone();
4639   }
4640 
4641   // Now try the tail folding
4642 
4643   // Invalidate interleave groups that require an epilogue if we can't mask
4644   // the interleave-group.
4645   if (!useMaskedInterleavedAccesses(TTI)) {
4646     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
4647            "No decisions should have been taken at this point");
4648     // Note: There is no need to invalidate any cost modeling decisions here, as
4649     // none were taken so far.
4650     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
4651   }
4652 
4653   FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(MaxTC, UserVF, true);
4654 
4655   // Avoid tail folding if the trip count is known to be a multiple of any VF
4656   // we choose.
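  // Illustrative example (assumed values): if loop guards imply the trip
  // count is a multiple of 8 and MaxPowerOf2RuntimeVF * UserIC == 8, the URem
  // computed below folds to zero and the tail can be skipped entirely.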
4657   std::optional<unsigned> MaxPowerOf2RuntimeVF =
4658       MaxFactors.FixedVF.getFixedValue();
4659   if (MaxFactors.ScalableVF) {
4660     std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
4661     if (MaxVScale && TTI.isVScaleKnownToBeAPowerOfTwo()) {
4662       MaxPowerOf2RuntimeVF = std::max<unsigned>(
4663           *MaxPowerOf2RuntimeVF,
4664           *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue());
4665     } else
4666       MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now.
4667   }
4668 
4669   if (MaxPowerOf2RuntimeVF && *MaxPowerOf2RuntimeVF > 0) {
4670     assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
4671            "MaxFixedVF must be a power of 2");
4672     unsigned MaxVFtimesIC =
4673         UserIC ? *MaxPowerOf2RuntimeVF * UserIC : *MaxPowerOf2RuntimeVF;
4674     ScalarEvolution *SE = PSE.getSE();
4675     const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
4676     const SCEV *ExitCount = SE->getAddExpr(
4677         BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
4678     const SCEV *Rem = SE->getURemExpr(
4679         SE->applyLoopGuards(ExitCount, TheLoop),
4680         SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
4681     if (Rem->isZero()) {
4682       // Accept MaxFixedVF if we do not have a tail.
4683       LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
4684       return MaxFactors;
4685     }
4686   }
4687 
4688   // If we don't know the precise trip count, or if the trip count that we
4689   // found modulo the vectorization factor is not zero, try to fold the tail
4690   // by masking.
4691   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
4692   if (Legal->prepareToFoldTailByMasking()) {
4693     CanFoldTailByMasking = true;
4694     return MaxFactors;
4695   }
4696 
4697   // If there was a tail-folding hint/switch, but we can't fold the tail by
4698   // masking, fallback to a vectorization with a scalar epilogue.
4699   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
4700     LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
4701                          "scalar epilogue instead.\n");
4702     ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4703     return MaxFactors;
4704   }
4705 
4706   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
4707     LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
4708     return FixedScalableVFPair::getNone();
4709   }
4710 
4711   if (TC == 0) {
4712     reportVectorizationFailure(
4713         "Unable to calculate the loop count due to complex control flow",
4714         "unable to calculate the loop count due to complex control flow",
4715         "UnknownLoopCountComplexCFG", ORE, TheLoop);
4716     return FixedScalableVFPair::getNone();
4717   }
4718 
4719   reportVectorizationFailure(
4720       "Cannot optimize for size and vectorize at the same time.",
4721       "cannot optimize for size and vectorize at the same time. "
4722       "Enable vectorization of this loop with '#pragma clang loop "
4723       "vectorize(enable)' when compiling with -Os/-Oz",
4724       "NoTailLoopWithOptForSize", ORE, TheLoop);
4725   return FixedScalableVFPair::getNone();
4726 }
4727 
4728 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
4729     unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType,
4730     ElementCount MaxSafeVF, bool FoldTailByMasking) {
4731   bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
4732   const TypeSize WidestRegister = TTI.getRegisterBitWidth(
4733       ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
4734                            : TargetTransformInfo::RGK_FixedWidthVector);
4735 
4736   // Convenience function to return the minimum of two ElementCounts.
4737   auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
4738     assert((LHS.isScalable() == RHS.isScalable()) &&
4739            "Scalable flags must match");
4740     return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
4741   };
4742 
4743   // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
4744   // Note that both WidestRegister and WidestType may not be powers of 2.
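  // Worked example (illustrative): with a 256-bit widest register and
  // WidestType = 64 bits, MaxVectorElementCount = bit_floor(256 / 64) = 4
  // lanes (fixed, or "vscale x 4" when computing a scalable maximum).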
4745   auto MaxVectorElementCount = ElementCount::get(
4746       llvm::bit_floor(WidestRegister.getKnownMinValue() / WidestType),
4747       ComputeScalableMaxVF);
4748   MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
4749   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
4750                     << (MaxVectorElementCount * WidestType) << " bits.\n");
4751 
4752   if (!MaxVectorElementCount) {
4753     LLVM_DEBUG(dbgs() << "LV: The target has no "
4754                       << (ComputeScalableMaxVF ? "scalable" : "fixed")
4755                       << " vector registers.\n");
4756     return ElementCount::getFixed(1);
4757   }
4758 
4759   unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue();
4760   if (MaxVectorElementCount.isScalable() &&
4761       TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
4762     auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
4763     auto Min = Attr.getVScaleRangeMin();
4764     WidestRegisterMinEC *= Min;
4765   }
4766 
4767   // When a scalar epilogue is required, at least one iteration of the scalar
4768   // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
4769   // max VF that results in a dead vector loop.
4770   if (MaxTripCount > 0 && requiresScalarEpilogue(true))
4771     MaxTripCount -= 1;
4772 
4773   if (MaxTripCount && MaxTripCount <= WidestRegisterMinEC &&
4774       (!FoldTailByMasking || isPowerOf2_32(MaxTripCount))) {
4775     // If the upper bound loop trip count (TC) is known at compile time, there
4776     // is no point in choosing a VF greater than TC (as done in the loop below).
4777     // Select the maximum power of two which doesn't exceed TC. If
4778     // MaxVectorElementCount is scalable, we only fall back on a fixed VF when
4779     // the TC is less than or equal to the known number of lanes.
4780     auto ClampedUpperTripCount = llvm::bit_floor(MaxTripCount);
4781     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
4782                          "exceeding the constant trip count: "
4783                       << ClampedUpperTripCount << "\n");
4784     return ElementCount::get(
4785         ClampedUpperTripCount,
4786         FoldTailByMasking ? MaxVectorElementCount.isScalable() : false);
4787   }
4788 
4789   TargetTransformInfo::RegisterKind RegKind =
4790       ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
4791                            : TargetTransformInfo::RGK_FixedWidthVector;
4792   ElementCount MaxVF = MaxVectorElementCount;
4793   if (MaximizeBandwidth ||
4794       (MaximizeBandwidth.getNumOccurrences() == 0 &&
4795        (TTI.shouldMaximizeVectorBandwidth(RegKind) ||
4796         (UseWiderVFIfCallVariantsPresent && Legal->hasVectorCallVariants())))) {
4797     auto MaxVectorElementCountMaxBW = ElementCount::get(
4798         llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType),
4799         ComputeScalableMaxVF);
4800     MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
4801 
4802     // Collect all viable vectorization factors larger than the default MaxVF
4803     // (i.e. MaxVectorElementCount).
4804     SmallVector<ElementCount, 8> VFs;
4805     for (ElementCount VS = MaxVectorElementCount * 2;
4806          ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
4807       VFs.push_back(VS);
4808 
4809     // For each VF calculate its register usage.
4810     auto RUs = calculateRegisterUsage(VFs);
4811 
4812     // Select the largest VF which doesn't require more registers than existing
4813     // ones.
4814     for (int i = RUs.size() - 1; i >= 0; --i) {
4815       bool Selected = true;
4816       for (auto &pair : RUs[i].MaxLocalUsers) {
4817         unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
4818         if (pair.second > TargetNumRegisters)
4819           Selected = false;
4820       }
4821       if (Selected) {
4822         MaxVF = VFs[i];
4823         break;
4824       }
4825     }
4826     if (ElementCount MinVF =
4827             TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
4828       if (ElementCount::isKnownLT(MaxVF, MinVF)) {
4829         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
4830                           << ") with target's minimum: " << MinVF << '\n');
4831         MaxVF = MinVF;
4832       }
4833     }
4834 
4835     // Invalidate any widening decisions we might have made, in case the loop
4836     // requires prediction (decided later), but we have already made some
4837     // load/store widening decisions.
4838     invalidateCostModelingDecisions();
4839   }
4840   return MaxVF;
4841 }
4842 
4843 /// Convenience function that returns the value of vscale_range iff
4844 /// vscale_range.min == vscale_range.max or otherwise returns the value
4845 /// returned by the corresponding TTI method.
4846 static std::optional<unsigned>
4847 getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI) {
4848   const Function *Fn = L->getHeader()->getParent();
4849   if (Fn->hasFnAttribute(Attribute::VScaleRange)) {
4850     auto Attr = Fn->getFnAttribute(Attribute::VScaleRange);
4851     auto Min = Attr.getVScaleRangeMin();
4852     auto Max = Attr.getVScaleRangeMax();
4853     if (Max && Min == Max)
4854       return Max;
4855   }
4856 
4857   return TTI.getVScaleForTuning();
4858 }
4859 
4860 bool LoopVectorizationPlanner::isMoreProfitable(
4861     const VectorizationFactor &A, const VectorizationFactor &B) const {
4862   InstructionCost CostA = A.Cost;
4863   InstructionCost CostB = B.Cost;
4864 
4865   unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(OrigLoop);
4866 
4867   if (!A.Width.isScalable() && !B.Width.isScalable() && MaxTripCount) {
4868     // If the trip count is a known (possibly small) constant, the trip count
4869     // will be rounded up to an integer number of iterations under
4870     // FoldTailByMasking. The total cost in that case will be
4871     // VecCost*ceil(TripCount/VF). When not folding the tail, the total
4872     // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be
4873     // some extra overheads, but for the purpose of comparing the costs of
4874     // different VFs we can use this to compare the total loop-body cost
4875     // expected after vectorization.
4876     auto GetCostForTC = [MaxTripCount, this](unsigned VF,
4877                                              InstructionCost VectorCost,
4878                                              InstructionCost ScalarCost) {
4879       return CM.foldTailByMasking() ? VectorCost * divideCeil(MaxTripCount, VF)
4880                                     : VectorCost * (MaxTripCount / VF) +
4881                                           ScalarCost * (MaxTripCount % VF);
4882     };
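    // For example (illustrative numbers): with MaxTripCount = 17, VF = 4,
    // VectorCost = 10 and ScalarCost = 3, folding the tail estimates
    // 10 * ceil(17 / 4) = 50, while not folding estimates
    // 10 * (17 / 4) + 3 * (17 % 4) = 40 + 3 = 43.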
4883     auto RTCostA = GetCostForTC(A.Width.getFixedValue(), CostA, A.ScalarCost);
4884     auto RTCostB = GetCostForTC(B.Width.getFixedValue(), CostB, B.ScalarCost);
4885 
4886     return RTCostA < RTCostB;
4887   }
4888 
4889   // Improve estimate for the vector width if it is scalable.
4890   unsigned EstimatedWidthA = A.Width.getKnownMinValue();
4891   unsigned EstimatedWidthB = B.Width.getKnownMinValue();
4892   if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI)) {
4893     if (A.Width.isScalable())
4894       EstimatedWidthA *= *VScale;
4895     if (B.Width.isScalable())
4896       EstimatedWidthB *= *VScale;
4897   }
4898 
4899   // Assume vscale may be larger than 1 (or the value being tuned for),
4900   // so that scalable vectorization is slightly favorable over fixed-width
4901   // vectorization.
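  // For example (illustrative): if CostA == CostB == 8, B.Width is fixed 4 and
  // A.Width is vscale x 2 with an assumed vscale of 2 (EstimatedWidthA = 4),
  // then 8 * 4 <= 8 * 4 holds and the scalable factor wins the tie.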
4902   if (A.Width.isScalable() && !B.Width.isScalable())
4903     return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA);
4904 
4905   // To avoid the need for FP division:
4906   //      (CostA / A.Width) < (CostB / B.Width)
4907   // <=>  (CostA * B.Width) < (CostB * A.Width)
4908   return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA);
4909 }
4910 
4911 static void emitInvalidCostRemarks(SmallVector<InstructionVFPair> InvalidCosts,
4912                                    OptimizationRemarkEmitter *ORE,
4913                                    Loop *TheLoop) {
4914   if (InvalidCosts.empty())
4915     return;
4916 
4917   // Emit a report of VFs with invalid costs in the loop.
4918 
4919   // Group the remarks per instruction, keeping the instruction order from
4920   // InvalidCosts.
4921   std::map<Instruction *, unsigned> Numbering;
4922   unsigned I = 0;
4923   for (auto &Pair : InvalidCosts)
4924     if (!Numbering.count(Pair.first))
4925       Numbering[Pair.first] = I++;
4926 
4927   // Sort the list, first on instruction(number) then on VF.
4928   sort(InvalidCosts, [&Numbering](InstructionVFPair &A, InstructionVFPair &B) {
4929     if (Numbering[A.first] != Numbering[B.first])
4930       return Numbering[A.first] < Numbering[B.first];
4931     ElementCountComparator ECC;
4932     return ECC(A.second, B.second);
4933   });
4934 
4935   // For a list of ordered instruction-vf pairs:
4936   //   [(load, vf1), (load, vf2), (store, vf1)]
4937   // Group the instructions together to emit separate remarks for:
4938   //   load  (vf1, vf2)
4939   //   store (vf1)
4940   auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts);
4941   auto Subset = ArrayRef<InstructionVFPair>();
4942   do {
4943     if (Subset.empty())
4944       Subset = Tail.take_front(1);
4945 
4946     Instruction *I = Subset.front().first;
4947 
4948     // If the next instruction is different, or if there are no other pairs,
4949     // emit a remark for the collated subset. e.g.
4950     //   [(load, vf1), (load, vf2)]
4951     // to emit:
4952     //  remark: invalid costs for 'load' at VF=(vf1, vf2)
4953     if (Subset == Tail || Tail[Subset.size()].first != I) {
4954       std::string OutString;
4955       raw_string_ostream OS(OutString);
4956       assert(!Subset.empty() && "Unexpected empty range");
4957       OS << "Instruction with invalid costs prevented vectorization at VF=(";
4958       for (const auto &Pair : Subset)
4959         OS << (Pair.second == Subset.front().second ? "" : ", ") << Pair.second;
4960       OS << "):";
4961       if (auto *CI = dyn_cast<CallInst>(I))
4962         OS << " call to " << CI->getCalledFunction()->getName();
4963       else
4964         OS << " " << I->getOpcodeName();
4965       OS.flush();
4966       reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I);
4967       Tail = Tail.drop_front(Subset.size());
4968       Subset = {};
4969     } else
4970       // Grow the subset by one element
4971       Subset = Tail.take_front(Subset.size() + 1);
4972   } while (!Tail.empty());
4973 }
4974 
4975 VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor(
4976     const ElementCountSet &VFCandidates) {
4977   InstructionCost ExpectedCost =
4978       CM.expectedCost(ElementCount::getFixed(1)).first;
4979   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
4980   assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
4981   assert(VFCandidates.count(ElementCount::getFixed(1)) &&
4982          "Expected Scalar VF to be a candidate");
4983 
4984   const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost,
4985                                        ExpectedCost);
4986   VectorizationFactor ChosenFactor = ScalarCost;
4987 
4988   bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
4989   if (ForceVectorization && VFCandidates.size() > 1) {
4990     // Ignore scalar width, because the user explicitly wants vectorization.
4991     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
4992     // evaluation.
4993     ChosenFactor.Cost = InstructionCost::getMax();
4994   }
4995 
4996   SmallVector<InstructionVFPair> InvalidCosts;
4997   for (const auto &i : VFCandidates) {
4998     // The cost for scalar VF=1 is already calculated, so ignore it.
4999     if (i.isScalar())
5000       continue;
5001 
5002     LoopVectorizationCostModel::VectorizationCostTy C =
5003         CM.expectedCost(i, &InvalidCosts);
5004     VectorizationFactor Candidate(i, C.first, ScalarCost.ScalarCost);
5005 
5006 #ifndef NDEBUG
5007     unsigned AssumedMinimumVscale = 1;
5008     if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI))
5009       AssumedMinimumVscale = *VScale;
5010     unsigned Width =
5011         Candidate.Width.isScalable()
5012             ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
5013             : Candidate.Width.getFixedValue();
5014     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5015                       << " costs: " << (Candidate.Cost / Width));
5016     if (i.isScalable())
5017       LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
5018                         << AssumedMinimumVscale << ")");
5019     LLVM_DEBUG(dbgs() << ".\n");
5020 #endif
5021 
5022     if (!C.second && !ForceVectorization) {
5023       LLVM_DEBUG(
5024           dbgs() << "LV: Not considering vector loop of width " << i
5025                  << " because it will not generate any vector instructions.\n");
5026       continue;
5027     }
5028 
5029     // If profitable, add it to the ProfitableVFs list.
5030     if (isMoreProfitable(Candidate, ScalarCost))
5031       ProfitableVFs.push_back(Candidate);
5032 
5033     if (isMoreProfitable(Candidate, ChosenFactor))
5034       ChosenFactor = Candidate;
5035   }
5036 
5037   emitInvalidCostRemarks(InvalidCosts, ORE, OrigLoop);
5038 
5039   if (!EnableCondStoresVectorization && CM.hasPredStores()) {
5040     reportVectorizationFailure(
5041         "There are conditional stores.",
5042         "store that is conditionally executed prevents vectorization",
5043         "ConditionalStore", ORE, OrigLoop);
5044     ChosenFactor = ScalarCost;
5045   }
5046 
5047   LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
5048                  !isMoreProfitable(ChosenFactor, ScalarCost)) dbgs()
5049              << "LV: Vectorization seems to be not beneficial, "
5050              << "but was forced by a user.\n");
5051   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
5052   return ChosenFactor;
5053 }
5054 
5055 bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
5056     ElementCount VF) const {
5057   // Cross iteration phis such as reductions need special handling and are
5058   // currently unsupported.
5059   if (any_of(OrigLoop->getHeader()->phis(),
5060              [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(&Phi); }))
5061     return false;
5062 
5063   // Phis with uses outside of the loop require special handling and are
5064   // currently unsupported.
5065   for (const auto &Entry : Legal->getInductionVars()) {
5066     // Look for uses of the value of the induction at the last iteration.
5067     Value *PostInc =
5068         Entry.first->getIncomingValueForBlock(OrigLoop->getLoopLatch());
5069     for (User *U : PostInc->users())
5070       if (!OrigLoop->contains(cast<Instruction>(U)))
5071         return false;
5072     // Look for uses of penultimate value of the induction.
5073     for (User *U : Entry.first->users())
5074       if (!OrigLoop->contains(cast<Instruction>(U)))
5075         return false;
5076   }
5077 
5078   // Epilogue vectorization code has not been audited to ensure it handles
5079   // non-latch exits properly.  It may be fine, but it needs to be audited
5080   // and tested.
5081   if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch())
5082     return false;
5083 
5084   return true;
5085 }
5086 
5087 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
5088     const ElementCount VF) const {
5089   // FIXME: We need a much better cost-model to take different parameters such
5090   // as register pressure, code size increase and cost of extra branches into
5091   // account. For now we apply a very crude heuristic and only consider loops
5092   // with vectorization factors larger than a certain value.
5093 
5094   // Allow the target to opt out entirely.
5095   if (!TTI.preferEpilogueVectorization())
5096     return false;
5097 
5098   // We also consider epilogue vectorization unprofitable for targets that don't
5099   // consider interleaving beneficial (e.g. MVE).
5100   if (TTI.getMaxInterleaveFactor(VF) <= 1)
5101     return false;
5102 
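  // For example (illustrative): with VF = vscale x 4 and a tuning vscale of 2,
  // the estimated width below is 2 * 4 = 8, which is then compared against
  // EpilogueVectorizationMinVF.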
5103   unsigned Multiplier = 1;
5104   if (VF.isScalable())
5105     Multiplier = getVScaleForTuning(TheLoop, TTI).value_or(1);
5106   if ((Multiplier * VF.getKnownMinValue()) >= EpilogueVectorizationMinVF)
5107     return true;
5108   return false;
5109 }
5110 
5111 VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
5112     const ElementCount MainLoopVF, unsigned IC) {
5113   VectorizationFactor Result = VectorizationFactor::Disabled();
5114   if (!EnableEpilogueVectorization) {
5115     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n");
5116     return Result;
5117   }
5118 
5119   if (!CM.isScalarEpilogueAllowed()) {
5120     LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no "
5121                          "epilogue is allowed.\n");
5122     return Result;
5123   }
5124 
5125   // Not really a cost consideration, but check for unsupported cases here to
5126   // simplify the logic.
5127   if (!isCandidateForEpilogueVectorization(MainLoopVF)) {
5128     LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop "
5129                          "is not a supported candidate.\n");
5130     return Result;
5131   }
5132 
5133   if (EpilogueVectorizationForceVF > 1) {
5134     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n");
5135     ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF);
5136     if (hasPlanWithVF(ForcedEC))
5137       return {ForcedEC, 0, 0};
5138     else {
5139       LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not "
5140                            "viable.\n");
5141       return Result;
5142     }
5143   }
5144 
5145   if (OrigLoop->getHeader()->getParent()->hasOptSize() ||
5146       OrigLoop->getHeader()->getParent()->hasMinSize()) {
5147     LLVM_DEBUG(
5148         dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n");
5149     return Result;
5150   }
5151 
5152   if (!CM.isEpilogueVectorizationProfitable(MainLoopVF)) {
5153     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
5154                          "this loop\n");
5155     return Result;
5156   }
5157 
5158   // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
5159   // the main loop handles 8 lanes per iteration. We could still benefit from
5160   // vectorizing the epilogue loop with VF=4.
5161   ElementCount EstimatedRuntimeVF = MainLoopVF;
5162   if (MainLoopVF.isScalable()) {
5163     EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue());
5164     if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI))
5165       EstimatedRuntimeVF *= *VScale;
5166   }
5167 
5168   ScalarEvolution &SE = *PSE.getSE();
5169   Type *TCType = Legal->getWidestInductionType();
5170   const SCEV *RemainingIterations = nullptr;
5171   for (auto &NextVF : ProfitableVFs) {
5172     // Skip candidate VFs without a corresponding VPlan.
5173     if (!hasPlanWithVF(NextVF.Width))
5174       continue;
5175 
5176     // Skip candidate VFs with widths >= the estimated runtime VF (scalable
5177     // vectors) or the VF of the main loop (fixed vectors).
5178     if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
5179          ElementCount::isKnownGE(NextVF.Width, EstimatedRuntimeVF)) ||
5180         ElementCount::isKnownGE(NextVF.Width, MainLoopVF))
5181       continue;
5182 
5183     // If NextVF is greater than the number of remaining iterations, the
5184     // epilogue loop would be dead. Skip such factors.
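    // For example (illustrative): with a fixed MainLoopVF of 8, IC = 2 and a
    // trip count whose remainder modulo 16 is known to be 2, a candidate
    // NextVF of 4 exceeds the 2 remaining iterations and is skipped.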
5185     if (!MainLoopVF.isScalable() && !NextVF.Width.isScalable()) {
5186       // TODO: extend to support scalable VFs.
5187       if (!RemainingIterations) {
5188         const SCEV *TC = createTripCountSCEV(TCType, PSE, OrigLoop);
5189         RemainingIterations = SE.getURemExpr(
5190             TC, SE.getConstant(TCType, MainLoopVF.getKnownMinValue() * IC));
5191       }
5192       if (SE.isKnownPredicate(
5193               CmpInst::ICMP_UGT,
5194               SE.getConstant(TCType, NextVF.Width.getKnownMinValue()),
5195               RemainingIterations))
5196         continue;
5197     }
5198 
5199     if (Result.Width.isScalar() || isMoreProfitable(NextVF, Result))
5200       Result = NextVF;
5201   }
5202 
5203   if (Result != VectorizationFactor::Disabled())
5204     LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
5205                       << Result.Width << "\n");
5206   return Result;
5207 }
5208 
5209 std::pair<unsigned, unsigned>
5210 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5211   unsigned MinWidth = -1U;
5212   unsigned MaxWidth = 8;
5213   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5214   // For in-loop reductions, no element types are added to ElementTypesInLoop
5215   // if there are no loads/stores in the loop. In this case, check through the
5216   // reduction variables to determine the maximum width.
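  // For example (illustrative): a loop that loads i8 elements and stores i32
  // results ends up with {MinWidth, MaxWidth} = {8, 32} via the else branch
  // below.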
5217   if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
5218     // Reset MaxWidth so that we can find the smallest type used by recurrences
5219     // in the loop.
5220     MaxWidth = -1U;
5221     for (const auto &PhiDescriptorPair : Legal->getReductionVars()) {
5222       const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
5223       // When finding the min width used by the recurrence we need to account
5224       // for casts on the input operands of the recurrence.
5225       MaxWidth = std::min<unsigned>(
5226           MaxWidth, std::min<unsigned>(
5227                         RdxDesc.getMinWidthCastToRecurrenceTypeInBits(),
5228                         RdxDesc.getRecurrenceType()->getScalarSizeInBits()));
5229     }
5230   } else {
5231     for (Type *T : ElementTypesInLoop) {
5232       MinWidth = std::min<unsigned>(
5233           MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
5234       MaxWidth = std::max<unsigned>(
5235           MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
5236     }
5237   }
5238   return {MinWidth, MaxWidth};
5239 }
5240 
5241 void LoopVectorizationCostModel::collectElementTypesForWidening() {
5242   ElementTypesInLoop.clear();
5243   // For each block.
5244   for (BasicBlock *BB : TheLoop->blocks()) {
5245     // For each instruction in the loop.
5246     for (Instruction &I : BB->instructionsWithoutDebug()) {
5247       Type *T = I.getType();
5248 
5249       // Skip ignored values.
5250       if (ValuesToIgnore.count(&I))
5251         continue;
5252 
5253       // Only examine Loads, Stores and PHINodes.
5254       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5255         continue;
5256 
5257       // Examine PHI nodes that are reduction variables. Update the type to
5258       // account for the recurrence type.
5259       if (auto *PN = dyn_cast<PHINode>(&I)) {
5260         if (!Legal->isReductionVariable(PN))
5261           continue;
5262         const RecurrenceDescriptor &RdxDesc =
5263             Legal->getReductionVars().find(PN)->second;
5264         if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
5265             TTI.preferInLoopReduction(RdxDesc.getOpcode(),
5266                                       RdxDesc.getRecurrenceType(),
5267                                       TargetTransformInfo::ReductionFlags()))
5268           continue;
5269         T = RdxDesc.getRecurrenceType();
5270       }
5271 
5272       // Examine the stored values.
5273       if (auto *ST = dyn_cast<StoreInst>(&I))
5274         T = ST->getValueOperand()->getType();
5275 
5276       assert(T->isSized() &&
5277              "Expected the load/store/recurrence type to be sized");
5278 
5279       ElementTypesInLoop.insert(T);
5280     }
5281   }
5282 }
5283 
5284 unsigned
5285 LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
5286                                                   InstructionCost LoopCost) {
5287   // -- The interleave heuristics --
5288   // We interleave the loop in order to expose ILP and reduce the loop overhead.
5289   // There are many micro-architectural considerations that we can't predict
5290   // at this level. For example, frontend pressure (on decode or fetch) due to
5291   // code size, or the number and capabilities of the execution ports.
5292   //
5293   // We use the following heuristics to select the interleave count:
5294   // 1. If the code has reductions, then we interleave to break the cross
5295   // iteration dependency.
5296   // 2. If the loop is really small, then we interleave to reduce the loop
5297   // overhead.
5298   // 3. We don't interleave if we think that we will spill registers to memory
5299   // due to the increased register pressure.
5300 
5301   if (!isScalarEpilogueAllowed())
5302     return 1;
5303 
5304   // The maximum safe dependence distance was already used to limit the VF; do not interleave.
5305   if (!Legal->isSafeForAnyVectorWidth())
5306     return 1;
5307 
5308   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5309   const bool HasReductions = !Legal->getReductionVars().empty();
5310   // Do not interleave loops with a relatively small known or estimated trip
5311   // count. But we will interleave when InterleaveSmallLoopScalarReduction is
5312   // enabled, and the code has scalar reductions (HasReductions && VF == 1),
5313   // because with the above conditions interleaving can expose ILP and break
5314   // cross-iteration dependences for reductions.
5315   if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
5316       !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
5317     return 1;
5318 
5319   // If we did not calculate the cost for VF (because the user selected the VF)
5320   // then we calculate the cost of VF here.
5321   if (LoopCost == 0) {
5322     LoopCost = expectedCost(VF).first;
5323     assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost");
5324 
5325     // Loop body is free and there is no need for interleaving.
5326     if (LoopCost == 0)
5327       return 1;
5328   }
5329 
5330   RegisterUsage R = calculateRegisterUsage({VF})[0];
5331   // We divide by these constants so assume that we have at least one
5332   // instruction that uses at least one register.
5333   for (auto& pair : R.MaxLocalUsers) {
5334     pair.second = std::max(pair.second, 1U);
5335   }
5336 
5337   // We calculate the interleave count using the following formula.
5338   // Subtract the number of loop invariants from the number of available
5339   // registers. These registers are used by all of the interleaved instances.
5340   // Next, divide the remaining registers by the number of registers that are
5341   // required by the loop, in order to estimate how many parallel instances
5342   // fit without causing spills. All of this is rounded down if necessary to be
5343   // a power of two. We want a power-of-two interleave count to simplify any
5344   // addressing operations or alignment considerations.
5345   // We also want power-of-two interleave counts to ensure that the induction
5346   // variable of the vector loop wraps to zero when the tail is folded by masking;
5347   // this currently happens when OptForSize, in which case IC is set to 1 above.
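  // For example (illustrative, before the optional induction-variable
  // adjustment): with 32 usable registers in a class, 2 of them holding
  // loop-invariant values and a maximum of 5 values live at once,
  // bit_floor((32 - 2) / 5) = bit_floor(6) = 4 interleaved instances are
  // assumed to fit without spilling.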
5348   unsigned IC = UINT_MAX;
5349 
5350   for (auto& pair : R.MaxLocalUsers) {
5351     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5352     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5353                       << " registers of "
5354                       << TTI.getRegisterClassName(pair.first) << " register class\n");
5355     if (VF.isScalar()) {
5356       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5357         TargetNumRegisters = ForceTargetNumScalarRegs;
5358     } else {
5359       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5360         TargetNumRegisters = ForceTargetNumVectorRegs;
5361     }
5362     unsigned MaxLocalUsers = pair.second;
5363     unsigned LoopInvariantRegs = 0;
5364     if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
5365       LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
5366 
5367     unsigned TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs) /
5368                                      MaxLocalUsers);
5369     // Don't count the induction variable as interleaved.
5370     if (EnableIndVarRegisterHeur) {
5371       TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5372                               std::max(1U, (MaxLocalUsers - 1)));
5373     }
5374 
5375     IC = std::min(IC, TmpIC);
5376   }
5377 
5378   // Clamp the interleave ranges to reasonable counts.
5379   unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
5380 
5381   // Check if the user has overridden the max.
5382   if (VF.isScalar()) {
5383     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5384       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5385   } else {
5386     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5387       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5388   }
5389 
5390   unsigned EstimatedVF = VF.getKnownMinValue();
5391   if (VF.isScalable()) {
5392     if (std::optional<unsigned> VScale = getVScaleForTuning(TheLoop, TTI))
5393       EstimatedVF *= *VScale;
5394   }
5395   assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
5396 
5397   unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5398   if (KnownTC) {
5399     // If the trip count is known, we select between two prospective ICs, where
5400     // 1) the aggressive IC is capped by the trip count divided by VF
5401     // 2) the conservative IC is capped by the trip count divided by (VF * 2)
5402     // The final IC is selected in a way that the epilogue loop trip count is
5403     // minimized while maximizing the IC itself, so that we either run the
5404     // vector loop at least once if it generates a small epilogue loop, or else
5405     // we run the vector loop at least twice.
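    // For example (illustrative): with KnownTC = 32, EstimatedVF = 4 and a
    // target maximum of 8, the aggressive bound is bit_floor(min(32/4, 8)) = 8
    // and the conservative bound is bit_floor(min(32/8, 8)) = 4; both leave no
    // scalar tail, so the aggressive value 8 is kept.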
5406 
5407     unsigned InterleaveCountUB = bit_floor(
5408         std::max(1u, std::min(KnownTC / EstimatedVF, MaxInterleaveCount)));
5409     unsigned InterleaveCountLB = bit_floor(std::max(
5410         1u, std::min(KnownTC / (EstimatedVF * 2), MaxInterleaveCount)));
5411     MaxInterleaveCount = InterleaveCountLB;
5412 
5413     if (InterleaveCountUB != InterleaveCountLB) {
5414       unsigned TailTripCountUB = (KnownTC % (EstimatedVF * InterleaveCountUB));
5415       unsigned TailTripCountLB = (KnownTC % (EstimatedVF * InterleaveCountLB));
5416       // If both produce the same scalar tail, maximize the IC to do the same
5417       // work in fewer vector loop iterations.
5418       if (TailTripCountUB == TailTripCountLB)
5419         MaxInterleaveCount = InterleaveCountUB;
5420     }
5421   } else if (BestKnownTC) {
5422     // If the trip count is only an estimated compile-time constant, cap the
5423     // IC at the trip count divided by (VF * 2), so that the vector loop runs
5424     // at least twice to make interleaving seem profitable when there is an
5425     // epilogue loop present. Since the exact trip count is not known, we
5426     // choose to be conservative in our IC estimate.
5427     MaxInterleaveCount = bit_floor(std::max(
5428         1u, std::min(*BestKnownTC / (EstimatedVF * 2), MaxInterleaveCount)));
5429   }
5430 
5431   assert(MaxInterleaveCount > 0 &&
5432          "Maximum interleave count must be greater than 0");
5433 
5434   // Clamp the calculated IC to be between 1 and the max interleave count
5435   // that the target and trip count allow.
5436   if (IC > MaxInterleaveCount)
5437     IC = MaxInterleaveCount;
5438   else
5439     // Make sure IC is greater than 0.
5440     IC = std::max(1u, IC);
5441 
5442   assert(IC > 0 && "Interleave count must be greater than 0.");
5443 
5444   // Interleave if we vectorized this loop and there is a reduction that could
5445   // benefit from interleaving.
5446   if (VF.isVector() && HasReductions) {
5447     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5448     return IC;
5449   }
5450 
5451   // For any scalar loop that either requires runtime checks or predication we
5452   // are better off leaving this to the unroller. Note that if we've already
5453   // vectorized the loop we will have done the runtime check and so interleaving
5454   // won't require further checks.
5455   bool ScalarInterleavingRequiresPredication =
5456       (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) {
5457          return Legal->blockNeedsPredication(BB);
5458        }));
5459   bool ScalarInterleavingRequiresRuntimePointerCheck =
5460       (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
5461 
5462   // We want to interleave small loops in order to reduce the loop overhead and
5463   // potentially expose ILP opportunities.
5464   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
5465                     << "LV: IC is " << IC << '\n'
5466                     << "LV: VF is " << VF << '\n');
5467   const bool AggressivelyInterleaveReductions =
5468       TTI.enableAggressiveInterleaving(HasReductions);
5469   if (!ScalarInterleavingRequiresRuntimePointerCheck &&
5470       !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
5471     // We assume that the cost overhead is 1 and we use the cost model
5472     // to estimate the cost of the loop and interleave until the cost of the
5473     // loop overhead is about 5% of the cost of the loop.
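    // For example (illustrative, assuming a small-loop threshold of 20): with
    // LoopCost = 6, bit_floor(20 / 6) = 2, so SmallIC = min(IC, 2).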
5474     unsigned SmallIC = std::min(IC, (unsigned)llvm::bit_floor<uint64_t>(
5475                                         SmallLoopCost / *LoopCost.getValue()));
5476 
5477     // Interleave until store/load ports (estimated by max interleave count) are
5478     // saturated.
5479     unsigned NumStores = Legal->getNumStores();
5480     unsigned NumLoads = Legal->getNumLoads();
5481     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5482     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5483 
5484     // There is little point in interleaving for reductions containing selects
5485     // and compares when VF=1 since it may just create more overhead than it's
5486     // worth for loops with small trip counts. This is because we still have to
5487     // do the final reduction after the loop.
5488     bool HasSelectCmpReductions =
5489         HasReductions &&
5490         any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5491           const RecurrenceDescriptor &RdxDesc = Reduction.second;
5492           return RecurrenceDescriptor::isAnyOfRecurrenceKind(
5493               RdxDesc.getRecurrenceKind());
5494         });
5495     if (HasSelectCmpReductions) {
5496       LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
5497       return 1;
5498     }
5499 
5500     // If we have a scalar reduction (vector reductions are already dealt with
5501     // by this point), we can increase the critical path length if the loop
5502     // we're interleaving is inside another loop. For tree-wise reductions
5503     // set the limit to 2, and for ordered reductions it's best to disable
5504     // interleaving entirely.
5505     if (HasReductions && TheLoop->getLoopDepth() > 1) {
5506       bool HasOrderedReductions =
5507           any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5508             const RecurrenceDescriptor &RdxDesc = Reduction.second;
5509             return RdxDesc.isOrdered();
5510           });
5511       if (HasOrderedReductions) {
5512         LLVM_DEBUG(
5513             dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
5514         return 1;
5515       }
5516 
5517       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5518       SmallIC = std::min(SmallIC, F);
5519       StoresIC = std::min(StoresIC, F);
5520       LoadsIC = std::min(LoadsIC, F);
5521     }
5522 
5523     if (EnableLoadStoreRuntimeInterleave &&
5524         std::max(StoresIC, LoadsIC) > SmallIC) {
5525       LLVM_DEBUG(
5526           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5527       return std::max(StoresIC, LoadsIC);
5528     }
5529 
5530     // If there are scalar reductions and TTI has enabled aggressive
5531     // interleaving for reductions, we will interleave to expose ILP.
5532     if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
5533         AggressivelyInterleaveReductions) {
5534       LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5535       // Interleave no less than SmallIC but not as aggressive as the normal IC
5536       // to satisfy the rare situation when resources are too limited.
5537       return std::max(IC / 2, SmallIC);
5538     } else {
5539       LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5540       return SmallIC;
5541     }
5542   }
5543 
5544   // Interleave if this is a large loop (small loops are already dealt with by
5545   // this point) that could benefit from interleaving.
5546   if (AggressivelyInterleaveReductions) {
5547     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5548     return IC;
5549   }
5550 
5551   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5552   return 1;
5553 }
5554 
5555 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5556 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
5557   // This function calculates the register usage by measuring the highest number
5558   // of values that are alive at a single location. Obviously, this is a very
5559   // rough estimation. We scan the loop in topological order and
5560   // assign a number to each instruction. We use RPO to ensure that defs are
5561   // met before their users. We assume that each instruction that has in-loop
5562   // users starts an interval. We record every time that an in-loop value is
5563   // used, so we have a list of the first and last occurrences of each
5564   // instruction. Next, we transpose this data structure into a multi map that
5565   // holds the list of intervals that *end* at a specific location. This multi
5566   // map allows us to perform a linear search. We scan the instructions linearly
5567   // and record each time that a new interval starts, by placing it in a set.
5568   // If we find this value in the multi-map then we remove it from the set.
5569   // The max register usage is the maximum size of the set.
5570   // We also search for instructions that are defined outside the loop, but are
5571   // used inside the loop. We need this number separately from the max-interval
5572   // usage number because when we unroll, loop-invariant values do not take
5573   // more registers.
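  // Illustrative sketch (not tied to any particular loop): for the sequence
  //   %a = load ...; %b = add %a, ...; %c = mul %a, %b; store %c, ...
  // both %a and %b are live when %c is computed, so the maximum number of
  // simultaneously open intervals (and hence the usage estimate for that
  // register class) is 2.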
5574   LoopBlocksDFS DFS(TheLoop);
5575   DFS.perform(LI);
5576 
5577   RegisterUsage RU;
5578 
5579   // Each 'key' in the map opens a new interval. The values
5580   // of the map are the index of the 'last seen' usage of the
5581   // instruction that is the key.
5582   using IntervalMap = DenseMap<Instruction *, unsigned>;
5583 
5584   // Maps instruction to its index.
5585   SmallVector<Instruction *, 64> IdxToInstr;
5586   // Marks the end of each interval.
5587   IntervalMap EndPoint;
5588   // Saves the list of instruction indices that are used in the loop.
5589   SmallPtrSet<Instruction *, 8> Ends;
5590   // Saves the list of values that are used in the loop but are defined outside
5591   // the loop (not including non-instruction values such as arguments and
5592   // constants).
5593   SmallSetVector<Instruction *, 8> LoopInvariants;
5594 
5595   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5596     for (Instruction &I : BB->instructionsWithoutDebug()) {
5597       IdxToInstr.push_back(&I);
5598 
5599       // Save the end location of each USE.
5600       for (Value *U : I.operands()) {
5601         auto *Instr = dyn_cast<Instruction>(U);
5602 
5603         // Ignore non-instruction values such as arguments, constants, etc.
5604         // FIXME: Might need some motivation for why these values are ignored.
5605         // If, for example, an argument is used inside the loop, it will
5606         // increase register pressure (so shouldn't we add it to LoopInvariants?).
5607         if (!Instr)
5608           continue;
5609 
5610         // If this instruction is outside the loop then record it and continue.
5611         if (!TheLoop->contains(Instr)) {
5612           LoopInvariants.insert(Instr);
5613           continue;
5614         }
5615 
5616         // Overwrite previous end points.
5617         EndPoint[Instr] = IdxToInstr.size();
5618         Ends.insert(Instr);
5619       }
5620     }
5621   }
5622 
5623   // Saves the list of intervals that end with the index in 'key'.
5624   using InstrList = SmallVector<Instruction *, 2>;
5625   DenseMap<unsigned, InstrList> TransposeEnds;
5626 
5627   // Transpose the EndPoints to a list of values that end at each index.
5628   for (auto &Interval : EndPoint)
5629     TransposeEnds[Interval.second].push_back(Interval.first);
5630 
5631   SmallPtrSet<Instruction *, 8> OpenIntervals;
5632   SmallVector<RegisterUsage, 8> RUs(VFs.size());
5633   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
5634 
5635   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5636 
5637   const auto &TTICapture = TTI;
5638   auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
5639     if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
5640       return 0;
5641     return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
5642   };
5643 
5644   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
5645     Instruction *I = IdxToInstr[i];
5646 
5647     // Remove all of the instructions that end at this location.
5648     InstrList &List = TransposeEnds[i];
5649     for (Instruction *ToRemove : List)
5650       OpenIntervals.erase(ToRemove);
5651 
5652     // Ignore instructions that are never used within the loop.
5653     if (!Ends.count(I))
5654       continue;
5655 
5656     // Skip ignored values.
5657     if (ValuesToIgnore.count(I))
5658       continue;
5659 
5660     collectInLoopReductions();
5661 
5662     // For each VF find the maximum usage of registers.
5663     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
5664       // Count the number of registers used, per register class, given all open
5665       // intervals.
5666       // Note that elements in this SmallMapVector will be default constructed
5667       // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if
5668       // there is no previous entry for ClassID.
5669       SmallMapVector<unsigned, unsigned, 4> RegUsage;
5670 
5671       if (VFs[j].isScalar()) {
5672         for (auto *Inst : OpenIntervals) {
5673           unsigned ClassID =
5674               TTI.getRegisterClassForType(false, Inst->getType());
5675           // FIXME: The target might use more than one register for the type
5676           // even in the scalar case.
5677           RegUsage[ClassID] += 1;
5678         }
5679       } else {
5680         collectUniformsAndScalars(VFs[j]);
5681         for (auto *Inst : OpenIntervals) {
5682           // Skip ignored values for VF > 1.
5683           if (VecValuesToIgnore.count(Inst))
5684             continue;
5685           if (isScalarAfterVectorization(Inst, VFs[j])) {
5686             unsigned ClassID =
5687                 TTI.getRegisterClassForType(false, Inst->getType());
5688             // FIXME: The target might use more than one register for the type
5689             // even in the scalar case.
5690             RegUsage[ClassID] += 1;
5691           } else {
5692             unsigned ClassID =
5693                 TTI.getRegisterClassForType(true, Inst->getType());
5694             RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
5695           }
5696         }
5697       }
5698 
5699       for (auto& pair : RegUsage) {
5700         auto &Entry = MaxUsages[j][pair.first];
5701         Entry = std::max(Entry, pair.second);
5702       }
5703     }
5704 
5705     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
5706                       << OpenIntervals.size() << '\n');
5707 
5708     // Add the current instruction to the list of open intervals.
5709     OpenIntervals.insert(I);
5710   }
5711 
5712   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
5713     // Note that elements in this SmallMapVector will be default constructed
5714     // as 0. So we can use "Invariant[ClassID] += n" in the code below even if
5715     // there is no previous entry for ClassID.
5716     SmallMapVector<unsigned, unsigned, 4> Invariant;
5717 
5718     for (auto *Inst : LoopInvariants) {
5719       // FIXME: The target might use more than one register for the type
5720       // even in the scalar case.
5721       bool IsScalar = all_of(Inst->users(), [&](User *U) {
5722         auto *I = cast<Instruction>(U);
5723         return TheLoop != LI->getLoopFor(I->getParent()) ||
5724                isScalarAfterVectorization(I, VFs[i]);
5725       });
5726 
5727       ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[i];
5728       unsigned ClassID =
5729           TTI.getRegisterClassForType(VF.isVector(), Inst->getType());
5730       Invariant[ClassID] += GetRegUsage(Inst->getType(), VF);
5731     }
5732 
5733     LLVM_DEBUG({
5734       dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
5735       dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
5736              << " item\n";
5737       for (const auto &pair : MaxUsages[i]) {
5738         dbgs() << "LV(REG): RegisterClass: "
5739                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5740                << " registers\n";
5741       }
5742       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
5743              << " item\n";
5744       for (const auto &pair : Invariant) {
5745         dbgs() << "LV(REG): RegisterClass: "
5746                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5747                << " registers\n";
5748       }
5749     });
5750 
5751     RU.LoopInvariantRegs = Invariant;
5752     RU.MaxLocalUsers = MaxUsages[i];
5753     RUs[i] = RU;
5754   }
5755 
5756   return RUs;
5757 }
5758 
5759 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
5760                                                            ElementCount VF) {
5761   // TODO: Cost model for emulated masked load/store is completely
5762   // broken. This hack guides the cost model to use an artificially
5763   // high enough value to practically disable vectorization with such
5764   // operations, except where the previously deployed legality hack allowed
5765   // using very low cost values. This is to avoid regressions coming simply
5766   // from moving the "masked load/store" check from legality to the cost model.
5767   // Masked Load/Gather emulation was previously never allowed.
5768   // A limited number of emulated Masked Store/Scatter operations were allowed.
5769   assert((isPredicatedInst(I)) &&
5770          "Expecting a scalar emulated instruction");
5771   return isa<LoadInst>(I) ||
5772          (isa<StoreInst>(I) &&
5773           NumPredStores > NumberOfStoresToPredicate);
5774 }
5775 
5776 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
5777   // If we aren't vectorizing the loop, or if we've already collected the
5778   // instructions to scalarize, there's nothing to do. Collection may already
5779   // have occurred if we have a user-selected VF and are now computing the
5780   // expected cost for interleaving.
5781   if (VF.isScalar() || VF.isZero() || InstsToScalarize.contains(VF))
5782     return;
5783 
5784   // Initialize a mapping for VF in InstsToScalarize. If we find that it's
5785   // not profitable to scalarize any instructions, the presence of VF in the
5786   // map will indicate that we've analyzed it already.
5787   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5788 
5789   PredicatedBBsAfterVectorization[VF].clear();
5790 
5791   // Find all the instructions that are scalar with predication in the loop and
5792   // determine if it would be better to not if-convert the blocks they are in.
5793   // If so, we also record the instructions to scalarize.
5794   for (BasicBlock *BB : TheLoop->blocks()) {
5795     if (!blockNeedsPredicationForAnyReason(BB))
5796       continue;
5797     for (Instruction &I : *BB)
5798       if (isScalarWithPredication(&I, VF)) {
5799         ScalarCostsTy ScalarCosts;
5800         // Do not apply discount if scalable, because that would lead to
5801         // invalid scalarization costs.
5802         // Do not apply discount logic if hacked cost is needed
5803         // for emulated masked memrefs.
5804         if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) &&
5805             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
5806           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5807         // Remember that BB will remain after vectorization.
5808         PredicatedBBsAfterVectorization[VF].insert(BB);
5809       }
5810   }
5811 }
5812 
5813 InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
5814     Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
5815   assert(!isUniformAfterVectorization(PredInst, VF) &&
5816          "Instruction marked uniform-after-vectorization will be predicated");
5817 
5818   // Initialize the discount to zero, meaning that the scalar version and the
5819   // vector version cost the same.
5820   InstructionCost Discount = 0;
5821 
5822   // Holds instructions to analyze. The instructions we visit are mapped in
5823   // ScalarCosts. Those instructions are the ones that would be scalarized if
5824   // we find that the scalar version costs less.
5825   SmallVector<Instruction *, 8> Worklist;
5826 
5827   // Returns true if the given instruction can be scalarized.
5828   auto canBeScalarized = [&](Instruction *I) -> bool {
5829     // We only attempt to scalarize instructions forming a single-use chain
5830     // from the original predicated block that would otherwise be vectorized.
5831     // Although not strictly necessary, we give up on instructions we know will
5832     // already be scalar to avoid traversing chains that are unlikely to be
5833     // beneficial.
5834     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5835         isScalarAfterVectorization(I, VF))
5836       return false;
5837 
5838     // If the instruction is scalar with predication, it will be analyzed
5839     // separately. We ignore it within the context of PredInst.
5840     if (isScalarWithPredication(I, VF))
5841       return false;
5842 
5843     // If any of the instruction's operands are uniform after vectorization,
5844     // the instruction cannot be scalarized. This prevents, for example, a
5845     // masked load from being scalarized.
5846     //
5847     // We assume we will only emit a value for lane zero of an instruction
5848     // marked uniform after vectorization, rather than VF identical values.
5849     // Thus, if we scalarize an instruction that uses a uniform, we would
5850     // create uses of values corresponding to the lanes we aren't emitting code
5851     // for. This behavior can be changed by allowing getScalarValue to clone
5852     // the lane zero values for uniforms rather than asserting.
5853     for (Use &U : I->operands())
5854       if (auto *J = dyn_cast<Instruction>(U.get()))
5855         if (isUniformAfterVectorization(J, VF))
5856           return false;
5857 
5858     // Otherwise, we can scalarize the instruction.
5859     return true;
5860   };
5861 
5862   // Compute the expected cost discount from scalarizing the entire expression
5863   // feeding the predicated instruction. We currently only consider expressions
5864   // that are single-use instruction chains.
5865   Worklist.push_back(PredInst);
5866   while (!Worklist.empty()) {
5867     Instruction *I = Worklist.pop_back_val();
5868 
5869     // If we've already analyzed the instruction, there's nothing to do.
5870     if (ScalarCosts.contains(I))
5871       continue;
5872 
5873     // Compute the cost of the vector instruction. Note that this cost already
5874     // includes the scalarization overhead of the predicated instruction.
5875     InstructionCost VectorCost = getInstructionCost(I, VF).first;
5876 
5877     // Compute the cost of the scalarized instruction. This cost is the cost of
5878     // the instruction as if it wasn't if-converted and instead remained in the
5879     // predicated block. We will scale this cost by block probability after
5880     // computing the scalarization overhead.
5881     InstructionCost ScalarCost =
5882         VF.getFixedValue() *
5883         getInstructionCost(I, ElementCount::getFixed(1)).first;
5884 
5885     // Compute the scalarization overhead of needed insertelement instructions
5886     // and phi nodes.
5887     TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
5888     if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
5889       ScalarCost += TTI.getScalarizationOverhead(
5890           cast<VectorType>(ToVectorTy(I->getType(), VF)),
5891           APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ true,
5892           /*Extract*/ false, CostKind);
5893       ScalarCost +=
5894           VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind);
5895     }
5896 
5897     // Compute the scalarization overhead of needed extractelement
5898     // instructions. For each of the instruction's operands, if the operand can
5899     // be scalarized, add it to the worklist; otherwise, account for the
5900     // overhead.
5901     for (Use &U : I->operands())
5902       if (auto *J = dyn_cast<Instruction>(U.get())) {
5903         assert(VectorType::isValidElementType(J->getType()) &&
5904                "Instruction has non-scalar type");
5905         if (canBeScalarized(J))
5906           Worklist.push_back(J);
5907         else if (needsExtract(J, VF)) {
5908           ScalarCost += TTI.getScalarizationOverhead(
5909               cast<VectorType>(ToVectorTy(J->getType(), VF)),
5910               APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false,
5911               /*Extract*/ true, CostKind);
5912         }
5913       }
5914 
5915     // Scale the total scalar cost by block probability.
5916     ScalarCost /= getReciprocalPredBlockProb();
5917 
5918     // Compute the discount. A non-negative discount means the vector version
5919     // of the instruction costs more, and scalarizing would be beneficial.
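    // For example (illustrative costs): if VectorCost = 12 and the
    // probability-scaled ScalarCost = 8, the discount grows by 4, indicating
    // the scalarized form is expected to be cheaper for this chain.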
5920     Discount += VectorCost - ScalarCost;
5921     ScalarCosts[I] = ScalarCost;
5922   }
5923 
5924   return Discount;
5925 }
5926 
5927 LoopVectorizationCostModel::VectorizationCostTy
5928 LoopVectorizationCostModel::expectedCost(
5929     ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) {
5930   VectorizationCostTy Cost;
5931 
5932   // For each block.
5933   for (BasicBlock *BB : TheLoop->blocks()) {
5934     VectorizationCostTy BlockCost;
5935 
5936     // For each instruction in the old loop.
5937     for (Instruction &I : BB->instructionsWithoutDebug()) {
5938       // Skip ignored values.
5939       if (ValuesToIgnore.count(&I) ||
5940           (VF.isVector() && VecValuesToIgnore.count(&I)))
5941         continue;
5942 
5943       VectorizationCostTy C = getInstructionCost(&I, VF);
5944 
5945       // Check if we should override the cost.
5946       if (C.first.isValid() &&
5947           ForceTargetInstructionCost.getNumOccurrences() > 0)
5948         C.first = InstructionCost(ForceTargetInstructionCost);
5949 
5950       // Keep a list of instructions with invalid costs.
5951       if (Invalid && !C.first.isValid())
5952         Invalid->emplace_back(&I, VF);
5953 
5954       BlockCost.first += C.first;
5955       BlockCost.second |= C.second;
5956       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
5957                         << " for VF " << VF << " For instruction: " << I
5958                         << '\n');
5959     }
5960 
5961     // If we are vectorizing a predicated block, it will have been
5962     // if-converted. This means that the block's instructions (aside from
5963     // stores and instructions that may divide by zero) will now be
5964     // unconditionally executed. For the scalar case, we may not always execute
5965     // the predicated block, if it is an if-else block. Thus, scale the block's
5966     // cost by the probability of executing it. blockNeedsPredication from
5967     // Legal is used so as to not include all blocks in tail folded loops.
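    // For example (illustrative, assuming the usual 50% block-probability
    // estimate, i.e. a reciprocal of 2): a predicated block whose scalar cost
    // sums to 10 contributes 5 to the scalar loop cost.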
5968     if (VF.isScalar() && Legal->blockNeedsPredication(BB))
5969       BlockCost.first /= getReciprocalPredBlockProb();
5970 
5971     Cost.first += BlockCost.first;
5972     Cost.second |= BlockCost.second;
5973   }
5974 
5975   return Cost;
5976 }
5977 
5978 /// Gets Address Access SCEV after verifying that the access pattern
5979 /// is loop invariant except the induction variable dependence.
5980 ///
5981 /// This SCEV can be sent to the Target in order to estimate the address
5982 /// calculation cost.
5983 static const SCEV *getAddressAccessSCEV(
5984               Value *Ptr,
5985               LoopVectorizationLegality *Legal,
5986               PredicatedScalarEvolution &PSE,
5987               const Loop *TheLoop) {
5988 
5989   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
5990   if (!Gep)
5991     return nullptr;
5992 
5993   // We are looking for a gep with all loop invariant indices except for one
5994   // which should be an induction variable.
5995   auto SE = PSE.getSE();
5996   unsigned NumOperands = Gep->getNumOperands();
5997   for (unsigned i = 1; i < NumOperands; ++i) {
5998     Value *Opd = Gep->getOperand(i);
5999     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
6000         !Legal->isInductionVariable(Opd))
6001       return nullptr;
6002   }
6003 
6004   // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
6005   return PSE.getSCEV(Ptr);
6006 }
6007 
6008 InstructionCost
6009 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
6010                                                         ElementCount VF) {
6011   assert(VF.isVector() &&
6012          "Scalarization cost of instruction implies vectorization.");
6013   if (VF.isScalable())
6014     return InstructionCost::getInvalid();
6015 
6016   Type *ValTy = getLoadStoreType(I);
6017   auto SE = PSE.getSE();
6018 
6019   unsigned AS = getLoadStoreAddressSpace(I);
6020   Value *Ptr = getLoadStorePointerOperand(I);
6021   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
6022   // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
6023   //       that it is being called from this specific place.
6024 
6025   // Figure out whether the access is strided and get the stride value
6026   // if it's known at compile time.
6027   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
6028 
6029   // Get the cost of the scalar memory instruction and address computation.
6030   InstructionCost Cost =
6031       VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
6032 
6033   // Don't pass *I here, since it is scalar but will actually be part of a
6034   // vectorized loop where the user of it is a vectorized instruction.
6035   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6036   const Align Alignment = getLoadStoreAlignment(I);
6037   Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(),
6038                                                       ValTy->getScalarType(),
6039                                                       Alignment, AS, CostKind);
6040 
6041   // Get the overhead of the extractelement and insertelement instructions
6042   // we might create due to scalarization.
6043   Cost += getScalarizationOverhead(I, VF, CostKind);
6044 
6045   // If we have a predicated load/store, it will need extra i1 extracts and
6046   // conditional branches, but may not be executed for each vector lane. Scale
6047   // the cost by the probability of executing the predicated block.
6048   if (isPredicatedInst(I)) {
6049     Cost /= getReciprocalPredBlockProb();
6050 
6051     // Add the cost of an i1 extract and a branch
6052     auto *Vec_i1Ty =
6053         VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
6054     Cost += TTI.getScalarizationOverhead(
6055         Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
6056         /*Insert=*/false, /*Extract=*/true, CostKind);
6057     Cost += TTI.getCFInstrCost(Instruction::Br, CostKind);
6058 
6059     if (useEmulatedMaskMemRefHack(I, VF))
6060       // Artificially set the cost high enough to practically disable
6061       // vectorization with such operations.
6062       Cost = 3000000;
6063   }
6064 
6065   return Cost;
6066 }
6067 
6068 InstructionCost
6069 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
6070                                                     ElementCount VF) {
6071   Type *ValTy = getLoadStoreType(I);
6072   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6073   Value *Ptr = getLoadStorePointerOperand(I);
6074   unsigned AS = getLoadStoreAddressSpace(I);
6075   int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
6076   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6077 
6078   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6079          "Stride should be 1 or -1 for consecutive memory access");
6080   const Align Alignment = getLoadStoreAlignment(I);
6081   InstructionCost Cost = 0;
6082   if (Legal->isMaskRequired(I)) {
6083     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6084                                       CostKind);
6085   } else {
6086     TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
6087     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6088                                 CostKind, OpInfo, I);
6089   }
6090 
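  // A reverse (stride -1) access additionally pays for a shuffle that
  // reverses the loaded or stored vector.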
6091   bool Reverse = ConsecutiveStride < 0;
6092   if (Reverse)
6093     Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy,
6094                                std::nullopt, CostKind, 0);
6095   return Cost;
6096 }
6097 
6098 InstructionCost
6099 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
6100                                                 ElementCount VF) {
6101   assert(Legal->isUniformMemOp(*I, VF));
6102 
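  // A uniform load is costed as a scalar load plus a broadcast of the loaded
  // value; a uniform store as a scalar store plus, unless the stored value is
  // loop-invariant, an extract of the last vector lane.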
6103   Type *ValTy = getLoadStoreType(I);
6104   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6105   const Align Alignment = getLoadStoreAlignment(I);
6106   unsigned AS = getLoadStoreAddressSpace(I);
6107   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6108   if (isa<LoadInst>(I)) {
6109     return TTI.getAddressComputationCost(ValTy) +
6110            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
6111                                CostKind) +
6112            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
6113   }
6114   StoreInst *SI = cast<StoreInst>(I);
6115 
6116   bool isLoopInvariantStoreValue = Legal->isInvariant(SI->getValueOperand());
6117   return TTI.getAddressComputationCost(ValTy) +
6118          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
6119                              CostKind) +
6120          (isLoopInvariantStoreValue
6121               ? 0
6122               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
6123                                        CostKind, VF.getKnownMinValue() - 1));
6124 }
6125 
6126 InstructionCost
6127 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
6128                                                  ElementCount VF) {
6129   Type *ValTy = getLoadStoreType(I);
6130   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6131   const Align Alignment = getLoadStoreAlignment(I);
6132   const Value *Ptr = getLoadStorePointerOperand(I);
6133 
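  // Cost = vector address computation + target gather/scatter cost (masked
  // when this access requires a mask).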
6134   return TTI.getAddressComputationCost(VectorTy) +
6135          TTI.getGatherScatterOpCost(
6136              I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
6137              TargetTransformInfo::TCK_RecipThroughput, I);
6138 }
6139 
6140 InstructionCost
6141 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
6142                                                    ElementCount VF) {
6143   Type *ValTy = getLoadStoreType(I);
6144   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6145   unsigned AS = getLoadStoreAddressSpace(I);
6146   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6147 
6148   auto Group = getInterleavedAccessGroup(I);
6149   assert(Group && "Fail to get an interleaved access group.");
6150 
6151   unsigned InterleaveFactor = Group->getFactor();
6152   auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
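  // E.g. a factor-2 group of i32 accesses at a fixed VF of 4 is costed on an
  // <8 x i32> wide vector.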
6153 
6154   // Holds the indices of existing members in the interleaved group.
6155   SmallVector<unsigned, 4> Indices;
6156   for (unsigned IF = 0; IF < InterleaveFactor; IF++)
6157     if (Group->getMember(IF))
6158       Indices.push_back(IF);
6159 
6160   // Calculate the cost of the whole interleaved group.
6161   bool UseMaskForGaps =
6162       (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
6163       (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
6164   InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
6165       I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
6166       AS, CostKind, Legal->isMaskRequired(I), UseMaskForGaps);
6167 
6168   if (Group->isReverse()) {
6169     // TODO: Add support for reversed masked interleaved access.
6170     assert(!Legal->isMaskRequired(I) &&
6171            "Reverse masked interleaved access not supported.");
6172     Cost += Group->getNumMembers() *
6173             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy,
6174                                std::nullopt, CostKind, 0);
6175   }
6176   return Cost;
6177 }
6178 
6179 std::optional<InstructionCost>
6180 LoopVectorizationCostModel::getReductionPatternCost(
6181     Instruction *I, ElementCount VF, Type *Ty,
6182     TTI::TargetCostKind CostKind) const {
6183   using namespace llvm::PatternMatch;
6184   // Early exit for no inloop reductions
6185   if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Ty))
6186     return std::nullopt;
6187   auto *VectorTy = cast<VectorType>(Ty);
6188 
6189   // We are looking for one of the following patterns and its minimal cost:
6190   //  reduce(mul(ext(A), ext(B))) or
6191   //  reduce(mul(A, B)) or
6192   //  reduce(ext(A)) or
6193   //  reduce(A).
6194   // The basic idea is that we walk down the tree to the root reduction
6195   // instruction in InLoopReductionImmediateChains. From there we find the
6196   // pattern of mul/ext and test the cost of the entire pattern vs the cost of
6197   // its components. If the reduction cost is lower, we return it for the
6198   // reduction instruction and 0 for the other instructions in the pattern.
6199   // Otherwise we return an invalid cost, indicating that the original cost
6200   // model should be used.
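  // For example, starting from the extend in the chain
  //   %ext = sext i16 %a to i32
  //   %mul = mul i32 %ext, %extB
  //   %add = add i32 %mul, %phi
  // we walk up to %add, the in-loop reduction instruction, and compare the
  // cost of a single multiply-accumulate reduction against the summed costs
  // of the separate extend, multiply and reduction steps.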
6201   Instruction *RetI = I;
6202   if (match(RetI, m_ZExtOrSExt(m_Value()))) {
6203     if (!RetI->hasOneUser())
6204       return std::nullopt;
6205     RetI = RetI->user_back();
6206   }
6207 
6208   if (match(RetI, m_OneUse(m_Mul(m_Value(), m_Value()))) &&
6209       RetI->user_back()->getOpcode() == Instruction::Add) {
6210     RetI = RetI->user_back();
6211   }
6212 
6213   // Test if the found instruction is a reduction; if not, return an invalid
6214   // cost so that the caller falls back to the original cost modelling.
6215   if (!InLoopReductionImmediateChains.count(RetI))
6216     return std::nullopt;
6217 
6218   // Find the reduction this chain is a part of and calculate the basic cost of
6219   // the reduction on its own.
6220   Instruction *LastChain = InLoopReductionImmediateChains.at(RetI);
6221   Instruction *ReductionPhi = LastChain;
6222   while (!isa<PHINode>(ReductionPhi))
6223     ReductionPhi = InLoopReductionImmediateChains.at(ReductionPhi);
6224 
6225   const RecurrenceDescriptor &RdxDesc =
6226       Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second;
6227 
6228   InstructionCost BaseCost = TTI.getArithmeticReductionCost(
6229       RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
6230 
6231   // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
6232   // normal fmul instruction to the cost of the fadd reduction.
6233   if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd)
6234     BaseCost +=
6235         TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);
6236 
6237   // If we're using ordered reductions then we can just return the base cost
6238   // here, since getArithmeticReductionCost calculates the full ordered
6239   // reduction cost when FP reassociation is not allowed.
6240   if (useOrderedReductions(RdxDesc))
6241     return BaseCost;
6242 
6243   // Get the operand that was not the reduction chain and match it to one of the
6244   // patterns, returning the better cost if it is found.
6245   Instruction *RedOp = RetI->getOperand(1) == LastChain
6246                            ? dyn_cast<Instruction>(RetI->getOperand(0))
6247                            : dyn_cast<Instruction>(RetI->getOperand(1));
6248 
6249   VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
6250 
6251   Instruction *Op0, *Op1;
6252   if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
6253       match(RedOp,
6254             m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
6255       match(Op0, m_ZExtOrSExt(m_Value())) &&
6256       Op0->getOpcode() == Op1->getOpcode() &&
6257       Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
6258       !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) &&
6259       (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
6260 
6261     // Matched reduce.add(ext(mul(ext(A), ext(B))))
6262     // Note that the extend opcodes all need to match, or if A==B they will have
6263     // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
6264     // which is equally fine.
6265     bool IsUnsigned = isa<ZExtInst>(Op0);
6266     auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
6267     auto *MulType = VectorType::get(Op0->getType(), VectorTy);
6268 
6269     InstructionCost ExtCost =
6270         TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
6271                              TTI::CastContextHint::None, CostKind, Op0);
6272     InstructionCost MulCost =
6273         TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
6274     InstructionCost Ext2Cost =
6275         TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType,
6276                              TTI::CastContextHint::None, CostKind, RedOp);
6277 
6278     InstructionCost RedCost = TTI.getMulAccReductionCost(
6279         IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
6280 
6281     if (RedCost.isValid() &&
6282         RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
6283       return I == RetI ? RedCost : 0;
6284   } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
6285              !TheLoop->isLoopInvariant(RedOp)) {
6286     // Matched reduce(ext(A))
6287     bool IsUnsigned = isa<ZExtInst>(RedOp);
6288     auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
6289     InstructionCost RedCost = TTI.getExtendedReductionCost(
6290         RdxDesc.getOpcode(), IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
6291         RdxDesc.getFastMathFlags(), CostKind);
6292 
6293     InstructionCost ExtCost =
6294         TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
6295                              TTI::CastContextHint::None, CostKind, RedOp);
6296     if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
6297       return I == RetI ? RedCost : 0;
6298   } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
6299              match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
6300     if (match(Op0, m_ZExtOrSExt(m_Value())) &&
6301         Op0->getOpcode() == Op1->getOpcode() &&
6302         !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) {
6303       bool IsUnsigned = isa<ZExtInst>(Op0);
6304       Type *Op0Ty = Op0->getOperand(0)->getType();
6305       Type *Op1Ty = Op1->getOperand(0)->getType();
6306       Type *LargestOpTy =
6307           Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
6308                                                                     : Op0Ty;
6309       auto *ExtType = VectorType::get(LargestOpTy, VectorTy);
6310 
6311       // Matched reduce.add(mul(ext(A), ext(B))), where the two exts may be of
6312       // different sizes. We take the largest type as the ext to reduce, and add
6313       // the remaining cost as, for example, reduce(mul(ext(ext(A)), ext(B))).
6314       InstructionCost ExtCost0 = TTI.getCastInstrCost(
6315           Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy),
6316           TTI::CastContextHint::None, CostKind, Op0);
6317       InstructionCost ExtCost1 = TTI.getCastInstrCost(
6318           Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy),
6319           TTI::CastContextHint::None, CostKind, Op1);
6320       InstructionCost MulCost =
6321           TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6322 
6323       InstructionCost RedCost = TTI.getMulAccReductionCost(
6324           IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
6325       InstructionCost ExtraExtCost = 0;
6326       if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
6327         Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
6328         ExtraExtCost = TTI.getCastInstrCost(
6329             ExtraExtOp->getOpcode(), ExtType,
6330             VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy),
6331             TTI::CastContextHint::None, CostKind, ExtraExtOp);
6332       }
6333 
6334       if (RedCost.isValid() &&
6335           (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
6336         return I == RetI ? RedCost : 0;
6337     } else if (!match(I, m_ZExtOrSExt(m_Value()))) {
6338       // Matched reduce.add(mul())
6339       InstructionCost MulCost =
6340           TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6341 
6342       InstructionCost RedCost = TTI.getMulAccReductionCost(
6343           true, RdxDesc.getRecurrenceType(), VectorTy, CostKind);
6344 
6345       if (RedCost.isValid() && RedCost < MulCost + BaseCost)
6346         return I == RetI ? RedCost : 0;
6347     }
6348   }
6349 
6350   return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt;
6351 }
6352 
6353 InstructionCost
6354 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
6355                                                      ElementCount VF) {
6356   // Calculate the scalar cost only. The vectorization cost should already
6357   // have been computed at this point.
6358   if (VF.isScalar()) {
6359     Type *ValTy = getLoadStoreType(I);
6360     const Align Alignment = getLoadStoreAlignment(I);
6361     unsigned AS = getLoadStoreAddressSpace(I);
6362 
6363     TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
6364     return TTI.getAddressComputationCost(ValTy) +
6365            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
6366                                TTI::TCK_RecipThroughput, OpInfo, I);
6367   }
6368   return getWideningCost(I, VF);
6369 }
6370 
6371 LoopVectorizationCostModel::VectorizationCostTy
6372 LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6373                                                ElementCount VF) {
6374   // If we know that this instruction will remain uniform, check the cost of
6375   // the scalar version.
6376   if (isUniformAfterVectorization(I, VF))
6377     VF = ElementCount::getFixed(1);
6378 
6379   if (VF.isVector() && isProfitableToScalarize(I, VF))
6380     return VectorizationCostTy(InstsToScalarize[VF][I], false);
6381 
6382   // Forced scalars do not have any scalarization overhead.
6383   auto ForcedScalar = ForcedScalars.find(VF);
6384   if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
6385     auto InstSet = ForcedScalar->second;
6386     if (InstSet.count(I))
6387       return VectorizationCostTy(
6388           (getInstructionCost(I, ElementCount::getFixed(1)).first *
6389            VF.getKnownMinValue()),
6390           false);
6391   }
6392 
6393   Type *VectorTy;
6394   InstructionCost C = getInstructionCost(I, VF, VectorTy);
6395 
6396   bool TypeNotScalarized = false;
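  // A vector type counts as scalarized if legalization splits it into at
  // least as many parts as it has (known minimum) lanes; scalable types get
  // the benefit of the doubt, as explained below.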
6397   if (VF.isVector() && VectorTy->isVectorTy()) {
6398     if (unsigned NumParts = TTI.getNumberOfParts(VectorTy)) {
6399       if (VF.isScalable())
6400         // <vscale x 1 x iN> is assumed to be profitable over iN because
6401         // scalable registers are a distinct register class from scalar ones.
6402         // If we ever find a target which wants to lower scalable vectors
6403         // back to scalars, we'll need to update this code to explicitly
6404         // ask TTI about the register class uses for each part.
6405         TypeNotScalarized = NumParts <= VF.getKnownMinValue();
6406       else
6407         TypeNotScalarized = NumParts < VF.getKnownMinValue();
6408     } else
6409       C = InstructionCost::getInvalid();
6410   }
6411   return VectorizationCostTy(C, TypeNotScalarized);
6412 }
6413 
6414 InstructionCost LoopVectorizationCostModel::getScalarizationOverhead(
6415     Instruction *I, ElementCount VF, TTI::TargetCostKind CostKind) const {
6416 
6417   // There is no mechanism yet to create a scalable scalarization loop,
6418   // so this is currently Invalid.
6419   if (VF.isScalable())
6420     return InstructionCost::getInvalid();
6421 
6422   if (VF.isScalar())
6423     return 0;
6424 
6425   InstructionCost Cost = 0;
6426   Type *RetTy = ToVectorTy(I->getType(), VF);
6427   if (!RetTy->isVoidTy() &&
6428       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
6429     Cost += TTI.getScalarizationOverhead(
6430         cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()),
6431         /*Insert*/ true,
6432         /*Extract*/ false, CostKind);
6433 
6434   // Some targets keep addresses scalar.
6435   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
6436     return Cost;
6437 
6438   // Some targets support efficient element stores.
6439   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
6440     return Cost;
6441 
6442   // Collect operands to consider.
6443   CallInst *CI = dyn_cast<CallInst>(I);
6444   Instruction::op_range Ops = CI ? CI->args() : I->operands();
6445 
6446   // Skip operands that do not require extraction/scalarization and do not incur
6447   // any overhead.
6448   SmallVector<Type *> Tys;
6449   for (auto *V : filterExtractingOperands(Ops, VF))
6450     Tys.push_back(MaybeVectorizeType(V->getType(), VF));
6451   return Cost + TTI.getOperandsScalarizationOverhead(
6452                     filterExtractingOperands(Ops, VF), Tys, CostKind);
6453 }
6454 
6455 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
6456   if (VF.isScalar())
6457     return;
6458   NumPredStores = 0;
6459   for (BasicBlock *BB : TheLoop->blocks()) {
6460     // For each instruction in the old loop.
6461     for (Instruction &I : *BB) {
6462       Value *Ptr =  getLoadStorePointerOperand(&I);
6463       if (!Ptr)
6464         continue;
6465 
6466       // TODO: We should generate better code and update the cost model for
6467       // predicated uniform stores. Today they are treated as any other
6468       // predicated store (see added test cases in
6469       // invariant-store-vectorization.ll).
6470       if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF))
6471         NumPredStores++;
6472 
6473       if (Legal->isUniformMemOp(I, VF)) {
6474         auto isLegalToScalarize = [&]() {
6475           if (!VF.isScalable())
6476             // Scalarization of fixed length vectors "just works".
6477             return true;
6478 
6479           // We have dedicated lowering for unpredicated uniform loads and
6480           // stores.  Note that even with tail folding we know that at least
6481           // one lane is active (i.e. generalized predication is not possible
6482           // here), and the logic below depends on this fact.
6483           if (!foldTailByMasking())
6484             return true;
6485 
6486           // For scalable vectors, a uniform memop load is always
6487           // uniform-by-parts and we know how to scalarize that.
6488           if (isa<LoadInst>(I))
6489             return true;
6490 
6491           // A uniform store isn't necessarily uniform-by-parts,
6492           // so we can't assume scalarization.
6493           auto &SI = cast<StoreInst>(I);
6494           return TheLoop->isLoopInvariant(SI.getValueOperand());
6495         };
6496 
6497         const InstructionCost GatherScatterCost =
6498           isLegalGatherOrScatter(&I, VF) ?
6499           getGatherScatterCost(&I, VF) : InstructionCost::getInvalid();
6500 
6501         // Load: Scalar load + broadcast
6502         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6503         // FIXME: This cost is a significant under-estimate for tail folded
6504         // memory ops.
6505         const InstructionCost ScalarizationCost = isLegalToScalarize() ?
6506           getUniformMemOpCost(&I, VF) : InstructionCost::getInvalid();
6507 
6508         // Choose the better solution for the current VF. Note that Invalid
6509         // costs compare as maximally large. If both are invalid, we record an
6510         // invalid cost, which signals a failure and aborts vectorization.
6511         if (GatherScatterCost < ScalarizationCost)
6512           setWideningDecision(&I, VF, CM_GatherScatter, GatherScatterCost);
6513         else
6514           setWideningDecision(&I, VF, CM_Scalarize, ScalarizationCost);
6515         continue;
6516       }
6517 
6518       // We assume that widening is the best solution when possible.
6519       if (memoryInstructionCanBeWidened(&I, VF)) {
6520         InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
6521         int ConsecutiveStride = Legal->isConsecutivePtr(
6522             getLoadStoreType(&I), getLoadStorePointerOperand(&I));
6523         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6524                "Expected consecutive stride.");
6525         InstWidening Decision =
6526             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6527         setWideningDecision(&I, VF, Decision, Cost);
6528         continue;
6529       }
6530 
6531       // Choose between Interleaving, Gather/Scatter or Scalarization.
6532       InstructionCost InterleaveCost = InstructionCost::getInvalid();
6533       unsigned NumAccesses = 1;
6534       if (isAccessInterleaved(&I)) {
6535         auto Group = getInterleavedAccessGroup(&I);
6536         assert(Group && "Fail to get an interleaved access group.");
6537 
6538         // Make one decision for the whole group.
6539         if (getWideningDecision(&I, VF) != CM_Unknown)
6540           continue;
6541 
6542         NumAccesses = Group->getNumMembers();
6543         if (interleavedAccessCanBeWidened(&I, VF))
6544           InterleaveCost = getInterleaveGroupCost(&I, VF);
6545       }
6546 
6547       InstructionCost GatherScatterCost =
6548           isLegalGatherOrScatter(&I, VF)
6549               ? getGatherScatterCost(&I, VF) * NumAccesses
6550               : InstructionCost::getInvalid();
6551 
6552       InstructionCost ScalarizationCost =
6553           getMemInstScalarizationCost(&I, VF) * NumAccesses;
6554 
6555       // Choose the better solution for the current VF, record the decision,
6556       // and use it during vectorization.
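      // Ties between interleaving and gather/scatter favor interleaving;
      // ties between gather/scatter and scalarization favor scalarization.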
6557       InstructionCost Cost;
6558       InstWidening Decision;
6559       if (InterleaveCost <= GatherScatterCost &&
6560           InterleaveCost < ScalarizationCost) {
6561         Decision = CM_Interleave;
6562         Cost = InterleaveCost;
6563       } else if (GatherScatterCost < ScalarizationCost) {
6564         Decision = CM_GatherScatter;
6565         Cost = GatherScatterCost;
6566       } else {
6567         Decision = CM_Scalarize;
6568         Cost = ScalarizationCost;
6569       }
6570       // If the instruction belongs to an interleave group, the whole group
6571       // receives the same decision. The whole group is charged the cost, but
6572       // the cost will actually be assigned to a single instruction.
6573       if (auto Group = getInterleavedAccessGroup(&I))
6574         setWideningDecision(Group, VF, Decision, Cost);
6575       else
6576         setWideningDecision(&I, VF, Decision, Cost);
6577     }
6578   }
6579 
6580   // Make sure that any load of an address and any other address computation
6581   // remains scalar unless there is gather/scatter support. This avoids
6582   // inevitable extracts into address registers, and also has the benefit of
6583   // activating LSR more, since that pass can't optimize vectorized
6584   // addresses.
6585   if (TTI.prefersVectorizedAddressing())
6586     return;
6587 
6588   // Start with all scalar pointer uses.
6589   SmallPtrSet<Instruction *, 8> AddrDefs;
6590   for (BasicBlock *BB : TheLoop->blocks())
6591     for (Instruction &I : *BB) {
6592       Instruction *PtrDef =
6593         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6594       if (PtrDef && TheLoop->contains(PtrDef) &&
6595           getWideningDecision(&I, VF) != CM_GatherScatter)
6596         AddrDefs.insert(PtrDef);
6597     }
6598 
6599   // Add all instructions used to generate the addresses.
6600   SmallVector<Instruction *, 4> Worklist;
6601   append_range(Worklist, AddrDefs);
6602   while (!Worklist.empty()) {
6603     Instruction *I = Worklist.pop_back_val();
6604     for (auto &Op : I->operands())
6605       if (auto *InstOp = dyn_cast<Instruction>(Op))
6606         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6607             AddrDefs.insert(InstOp).second)
6608           Worklist.push_back(InstOp);
6609   }
6610 
6611   for (auto *I : AddrDefs) {
6612     if (isa<LoadInst>(I)) {
6613       // Setting the desired widening decision should ideally be handled by
6614       // the cost functions, but since this involves determining whether the
6615       // loaded register is involved in an address computation, it is
6616       // instead changed here when we know this is the case.
6617       InstWidening Decision = getWideningDecision(I, VF);
6618       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6619         // Scalarize a widened load of address.
6620         setWideningDecision(
6621             I, VF, CM_Scalarize,
6622             (VF.getKnownMinValue() *
6623              getMemoryInstructionCost(I, ElementCount::getFixed(1))));
6624       else if (auto Group = getInterleavedAccessGroup(I)) {
6625         // Scalarize an interleave group of address loads.
6626         for (unsigned I = 0; I < Group->getFactor(); ++I) {
6627           if (Instruction *Member = Group->getMember(I))
6628             setWideningDecision(
6629                 Member, VF, CM_Scalarize,
6630                 (VF.getKnownMinValue() *
6631                  getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
6632         }
6633       }
6634     } else
6635       // Make sure I gets scalarized and receives a cost estimate without
6636       // scalarization overhead.
6637       ForcedScalars[VF].insert(I);
6638   }
6639 }
6640 
6641 void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
6642   assert(!VF.isScalar() &&
6643          "Trying to set a vectorization decision for a scalar VF");
6644 
6645   for (BasicBlock *BB : TheLoop->blocks()) {
6646     // For each instruction in the old loop.
6647     for (Instruction &I : *BB) {
6648       CallInst *CI = dyn_cast<CallInst>(&I);
6649 
6650       if (!CI)
6651         continue;
6652 
6653       InstructionCost ScalarCost = InstructionCost::getInvalid();
6654       InstructionCost VectorCost = InstructionCost::getInvalid();
6655       InstructionCost IntrinsicCost = InstructionCost::getInvalid();
6656       TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6657 
6658       Function *ScalarFunc = CI->getCalledFunction();
6659       Type *ScalarRetTy = CI->getType();
6660       SmallVector<Type *, 4> Tys, ScalarTys;
6661       bool MaskRequired = Legal->isMaskRequired(CI);
6662       for (auto &ArgOp : CI->args())
6663         ScalarTys.push_back(ArgOp->getType());
6664 
6665       // Compute corresponding vector type for return value and arguments.
6666       Type *RetTy = ToVectorTy(ScalarRetTy, VF);
6667       for (Type *ScalarTy : ScalarTys)
6668         Tys.push_back(ToVectorTy(ScalarTy, VF));
6669 
6670       // An in-loop reduction using an fmuladd intrinsic is a special case;
6671       // we don't want the normal cost for that intrinsic.
6672       if (RecurrenceDescriptor::isFMulAddIntrinsic(CI))
6673         if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind)) {
6674           setCallWideningDecision(CI, VF, CM_IntrinsicCall, nullptr,
6675                                   getVectorIntrinsicIDForCall(CI, TLI),
6676                                   std::nullopt, *RedCost);
6677           continue;
6678         }
6679 
6680       // Estimate cost of scalarized vector call. The source operands are
6681       // assumed to be vectors, so we need to extract individual elements from
6682       // them, execute VF scalar calls, and then gather the results into the
6683       // vector return value.
6684       InstructionCost ScalarCallCost =
6685           TTI.getCallInstrCost(ScalarFunc, ScalarRetTy, ScalarTys, CostKind);
6686 
6687       // Compute costs of unpacking argument values for the scalar calls and
6688       // packing the return values to a vector.
6689       InstructionCost ScalarizationCost =
6690           getScalarizationOverhead(CI, VF, CostKind);
6691 
6692       ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
6693 
6694       // Find the cost of vectorizing the call, if we can find a suitable
6695       // vector variant of the function.
6696       bool UsesMask = false;
6697       VFInfo FuncInfo;
6698       Function *VecFunc = nullptr;
6699       // Search through any available variants for one we can use at this VF.
6700       for (VFInfo &Info : VFDatabase::getMappings(*CI)) {
6701         // Must match requested VF.
6702         if (Info.Shape.VF != VF)
6703           continue;
6704 
6705         // Must take a mask argument if one is required
6706         if (MaskRequired && !Info.isMasked())
6707           continue;
6708 
6709         // Check that all parameter kinds are supported
6710         bool ParamsOk = true;
6711         for (VFParameter Param : Info.Shape.Parameters) {
6712           switch (Param.ParamKind) {
6713           case VFParamKind::Vector:
6714             break;
6715           case VFParamKind::OMP_Uniform: {
6716             Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
6717             // Make sure the scalar parameter in the loop is invariant.
6718             if (!PSE.getSE()->isLoopInvariant(PSE.getSCEV(ScalarParam),
6719                                               TheLoop))
6720               ParamsOk = false;
6721             break;
6722           }
6723           case VFParamKind::OMP_Linear: {
6724             Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
6725             // Find the stride for the scalar parameter in this loop and see if
6726             // it matches the stride for the variant.
6727             // TODO: do we need to figure out the cost of an extract to get the
6728             // first lane? Or do we hope that it will be folded away?
6729             ScalarEvolution *SE = PSE.getSE();
6730             const auto *SAR =
6731                 dyn_cast<SCEVAddRecExpr>(SE->getSCEV(ScalarParam));
6732 
6733             if (!SAR || SAR->getLoop() != TheLoop) {
6734               ParamsOk = false;
6735               break;
6736             }
6737 
6738             const SCEVConstant *Step =
6739                 dyn_cast<SCEVConstant>(SAR->getStepRecurrence(*SE));
6740 
6741             if (!Step ||
6742                 Step->getAPInt().getSExtValue() != Param.LinearStepOrPos)
6743               ParamsOk = false;
6744 
6745             break;
6746           }
6747           case VFParamKind::GlobalPredicate:
6748             UsesMask = true;
6749             break;
6750           default:
6751             ParamsOk = false;
6752             break;
6753           }
6754         }
6755 
6756         if (!ParamsOk)
6757           continue;
6758 
6759         // Found a suitable candidate, stop here.
6760         VecFunc = CI->getModule()->getFunction(Info.VectorName);
6761         FuncInfo = Info;
6762         break;
6763       }
6764 
6765       // Add in the cost of synthesizing a mask if one wasn't required.
6766       InstructionCost MaskCost = 0;
6767       if (VecFunc && UsesMask && !MaskRequired)
6768         MaskCost = TTI.getShuffleCost(
6769             TargetTransformInfo::SK_Broadcast,
6770             VectorType::get(IntegerType::getInt1Ty(
6771                                 VecFunc->getFunctionType()->getContext()),
6772                             VF));
6773 
6774       if (TLI && VecFunc && !CI->isNoBuiltin())
6775         VectorCost =
6776             TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind) + MaskCost;
6777 
6778       // Find the cost of an intrinsic; some targets may have instructions that
6779       // perform the operation without needing an actual call.
6780       Intrinsic::ID IID = getVectorIntrinsicIDForCall(CI, TLI);
6781       if (IID != Intrinsic::not_intrinsic)
6782         IntrinsicCost = getVectorIntrinsicCost(CI, VF);
6783 
6784       InstructionCost Cost = ScalarCost;
6785       InstWidening Decision = CM_Scalarize;
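      // Pick the cheapest strategy; ties favor a vector call over
      // scalarization and an intrinsic call over both.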
6786 
6787       if (VectorCost <= Cost) {
6788         Cost = VectorCost;
6789         Decision = CM_VectorCall;
6790       }
6791 
6792       if (IntrinsicCost <= Cost) {
6793         Cost = IntrinsicCost;
6794         Decision = CM_IntrinsicCall;
6795       }
6796 
6797       setCallWideningDecision(CI, VF, Decision, VecFunc, IID,
6798                               FuncInfo.getParamIndexForOptionalMask(), Cost);
6799     }
6800   }
6801 }
6802 
6803 InstructionCost
6804 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
6805                                                Type *&VectorTy) {
6806   Type *RetTy = I->getType();
6807   if (canTruncateToMinimalBitwidth(I, VF))
6808     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6809   auto SE = PSE.getSE();
6810   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6811 
6812   auto hasSingleCopyAfterVectorization = [this](Instruction *I,
6813                                                 ElementCount VF) -> bool {
6814     if (VF.isScalar())
6815       return true;
6816 
6817     auto Scalarized = InstsToScalarize.find(VF);
6818     assert(Scalarized != InstsToScalarize.end() &&
6819            "VF not yet analyzed for scalarization profitability");
6820     return !Scalarized->second.count(I) &&
6821            llvm::all_of(I->users(), [&](User *U) {
6822              auto *UI = cast<Instruction>(U);
6823              return !Scalarized->second.count(UI);
6824            });
6825   };
6826   (void) hasSingleCopyAfterVectorization;
6827 
6828   if (isScalarAfterVectorization(I, VF)) {
6829     // With the exception of GEPs and PHIs, after scalarization there should
6830     // only be one copy of the instruction generated in the loop. This is
6831     // because the VF is either 1, or any instructions that need scalarizing
6832     // have already been dealt with by the time we get here. As a result,
6833     // we don't have to multiply the instruction cost by VF.
6834     assert(I->getOpcode() == Instruction::GetElementPtr ||
6835            I->getOpcode() == Instruction::PHI ||
6836            (I->getOpcode() == Instruction::BitCast &&
6837             I->getType()->isPointerTy()) ||
6838            hasSingleCopyAfterVectorization(I, VF));
6839     VectorTy = RetTy;
6840   } else
6841     VectorTy = ToVectorTy(RetTy, VF);
6842 
6843   // TODO: We need to estimate the cost of intrinsic calls.
6844   switch (I->getOpcode()) {
6845   case Instruction::GetElementPtr:
6846     // We mark this instruction as zero-cost because the cost of GEPs in
6847     // vectorized code depends on whether the corresponding memory instruction
6848     // is scalarized or not. Therefore, we handle GEPs with the memory
6849     // instruction cost.
6850     return 0;
6851   case Instruction::Br: {
6852     // In cases of scalarized and predicated instructions, there will be VF
6853     // predicated blocks in the vectorized loop. Each branch around these
6854     // blocks also requires an extract of its vector compare i1 element.
6855     bool ScalarPredicatedBB = false;
6856     BranchInst *BI = cast<BranchInst>(I);
6857     if (VF.isVector() && BI->isConditional() &&
6858         (PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) ||
6859          PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))))
6860       ScalarPredicatedBB = true;
6861 
6862     if (ScalarPredicatedBB) {
6863       // Not possible to scalarize a scalable vector with predicated instructions.
6864       if (VF.isScalable())
6865         return InstructionCost::getInvalid();
6866       // Return cost for branches around scalarized and predicated blocks.
6867       auto *Vec_i1Ty =
6868           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6869       return (
6870           TTI.getScalarizationOverhead(
6871               Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()),
6872               /*Insert*/ false, /*Extract*/ true, CostKind) +
6873           (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
6874     } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
6875       // The back-edge branch will remain, as will all scalar branches.
6876       return TTI.getCFInstrCost(Instruction::Br, CostKind);
6877     else
6878       // This branch will be eliminated by if-conversion.
6879       return 0;
6880     // Note: We currently assume zero cost for an unconditional branch inside
6881     // a predicated block since it will become a fall-through, although we
6882     // may decide in the future to call TTI for all branches.
6883   }
6884   case Instruction::PHI: {
6885     auto *Phi = cast<PHINode>(I);
6886 
6887     // Fixed-order recurrences are replaced by vector shuffles inside the loop.
6888     if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) {
6889       SmallVector<int> Mask(VF.getKnownMinValue());
6890       std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
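      // E.g. with VF=4 the splice mask is <3,4,5,6>: the last lane of the
      // previous vector followed by the first three lanes of the current one.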
6891       return TTI.getShuffleCost(TargetTransformInfo::SK_Splice,
6892                                 cast<VectorType>(VectorTy), Mask, CostKind,
6893                                 VF.getKnownMinValue() - 1);
6894     }
6895 
6896     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6897     // converted into select instructions. We require N - 1 selects per phi
6898     // node, where N is the number of incoming values.
6899     if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
6900       return (Phi->getNumIncomingValues() - 1) *
6901              TTI.getCmpSelInstrCost(
6902                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
6903                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
6904                  CmpInst::BAD_ICMP_PREDICATE, CostKind);
6905 
6906     return TTI.getCFInstrCost(Instruction::PHI, CostKind);
6907   }
6908   case Instruction::UDiv:
6909   case Instruction::SDiv:
6910   case Instruction::URem:
6911   case Instruction::SRem:
6912     if (VF.isVector() && isPredicatedInst(I)) {
6913       const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
6914       return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ?
6915         ScalarCost : SafeDivisorCost;
6916     }
6917     // We've proven all lanes safe to speculate, fall through.
6918     [[fallthrough]];
6919   case Instruction::Add:
6920   case Instruction::FAdd:
6921   case Instruction::Sub:
6922   case Instruction::FSub:
6923   case Instruction::Mul:
6924   case Instruction::FMul:
6925   case Instruction::FDiv:
6926   case Instruction::FRem:
6927   case Instruction::Shl:
6928   case Instruction::LShr:
6929   case Instruction::AShr:
6930   case Instruction::And:
6931   case Instruction::Or:
6932   case Instruction::Xor: {
6933     // If we're speculating on the stride being 1, the multiplication may
6934     // fold away.  We can generalize this for all operations using the notion
6935     // of neutral elements.  (TODO)
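    // For example, an index computation 'mul %i, %stride' folds away once
    // %stride has been speculated to be 1 (its SCEV is the constant 1).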
6936     if (I->getOpcode() == Instruction::Mul &&
6937         (PSE.getSCEV(I->getOperand(0))->isOne() ||
6938          PSE.getSCEV(I->getOperand(1))->isOne()))
6939       return 0;
6940 
6941     // Detect reduction patterns
6942     if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
6943       return *RedCost;
6944 
6945     // Certain instructions can be cheaper to vectorize if they have a constant
6946     // second vector operand. One example of this is shifts on x86.
6947     Value *Op2 = I->getOperand(1);
6948     auto Op2Info = TTI.getOperandInfo(Op2);
6949     if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
6950         Legal->isInvariant(Op2))
6951       Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
6952 
6953     SmallVector<const Value *, 4> Operands(I->operand_values());
6954     return TTI.getArithmeticInstrCost(
6955         I->getOpcode(), VectorTy, CostKind,
6956         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6957         Op2Info, Operands, I);
6958   }
6959   case Instruction::FNeg: {
6960     return TTI.getArithmeticInstrCost(
6961         I->getOpcode(), VectorTy, CostKind,
6962         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6963         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6964         I->getOperand(0), I);
6965   }
6966   case Instruction::Select: {
6967     SelectInst *SI = cast<SelectInst>(I);
6968     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6969     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6970 
6971     const Value *Op0, *Op1;
6972     using namespace llvm::PatternMatch;
6973     if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
6974                         match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
6975       // select x, y, false --> x & y
6976       // select x, true, y --> x | y
6977       const auto [Op1VK, Op1VP] = TTI::getOperandInfo(Op0);
6978       const auto [Op2VK, Op2VP] = TTI::getOperandInfo(Op1);
6979       assert(Op0->getType()->getScalarSizeInBits() == 1 &&
6980               Op1->getType()->getScalarSizeInBits() == 1);
6981 
6982       SmallVector<const Value *, 2> Operands{Op0, Op1};
6983       return TTI.getArithmeticInstrCost(
6984           match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy,
6985           CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, I);
6986     }
6987 
6988     Type *CondTy = SI->getCondition()->getType();
6989     if (!ScalarCond)
6990       CondTy = VectorType::get(CondTy, VF);
6991 
6992     CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
6993     if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
6994       Pred = Cmp->getPredicate();
6995     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred,
6996                                   CostKind, I);
6997   }
6998   case Instruction::ICmp:
6999   case Instruction::FCmp: {
7000     Type *ValTy = I->getOperand(0)->getType();
7001     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
7002     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
7003       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
7004     VectorTy = ToVectorTy(ValTy, VF);
7005     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
7006                                   cast<CmpInst>(I)->getPredicate(), CostKind,
7007                                   I);
7008   }
7009   case Instruction::Store:
7010   case Instruction::Load: {
7011     ElementCount Width = VF;
7012     if (Width.isVector()) {
7013       InstWidening Decision = getWideningDecision(I, Width);
7014       assert(Decision != CM_Unknown &&
7015              "CM decision should be taken at this point");
7016       if (getWideningCost(I, VF) == InstructionCost::getInvalid())
7017         return InstructionCost::getInvalid();
7018       if (Decision == CM_Scalarize)
7019         Width = ElementCount::getFixed(1);
7020     }
7021     VectorTy = ToVectorTy(getLoadStoreType(I), Width);
7022     return getMemoryInstructionCost(I, VF);
7023   }
7024   case Instruction::BitCast:
7025     if (I->getType()->isPointerTy())
7026       return 0;
7027     [[fallthrough]];
7028   case Instruction::ZExt:
7029   case Instruction::SExt:
7030   case Instruction::FPToUI:
7031   case Instruction::FPToSI:
7032   case Instruction::FPExt:
7033   case Instruction::PtrToInt:
7034   case Instruction::IntToPtr:
7035   case Instruction::SIToFP:
7036   case Instruction::UIToFP:
7037   case Instruction::Trunc:
7038   case Instruction::FPTrunc: {
7039     // Computes the CastContextHint from a Load/Store instruction.
7040     auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
7041       assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7042              "Expected a load or a store!");
7043 
7044       if (VF.isScalar() || !TheLoop->contains(I))
7045         return TTI::CastContextHint::Normal;
7046 
7047       switch (getWideningDecision(I, VF)) {
7048       case LoopVectorizationCostModel::CM_GatherScatter:
7049         return TTI::CastContextHint::GatherScatter;
7050       case LoopVectorizationCostModel::CM_Interleave:
7051         return TTI::CastContextHint::Interleave;
7052       case LoopVectorizationCostModel::CM_Scalarize:
7053       case LoopVectorizationCostModel::CM_Widen:
7054         return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
7055                                         : TTI::CastContextHint::Normal;
7056       case LoopVectorizationCostModel::CM_Widen_Reverse:
7057         return TTI::CastContextHint::Reversed;
7058       case LoopVectorizationCostModel::CM_Unknown:
7059         llvm_unreachable("Instr did not go through cost modelling?");
7060       case LoopVectorizationCostModel::CM_VectorCall:
7061       case LoopVectorizationCostModel::CM_IntrinsicCall:
7062         llvm_unreachable_internal("Instr has invalid widening decision");
7063       }
7064 
7065       llvm_unreachable("Unhandled case!");
7066     };
7067 
7068     unsigned Opcode = I->getOpcode();
7069     TTI::CastContextHint CCH = TTI::CastContextHint::None;
7070     // For Trunc, the context is the only user, which must be a StoreInst.
7071     if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
7072       if (I->hasOneUse())
7073         if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
7074           CCH = ComputeCCH(Store);
7075     }
7076     // For Z/Sext, the context is the operand, which must be a LoadInst.
7077     else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
7078              Opcode == Instruction::FPExt) {
7079       if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
7080         CCH = ComputeCCH(Load);
7081     }
7082 
7083     // We optimize the truncation of induction variables having constant
7084     // integer steps. The cost of these truncations is the same as the scalar
7085     // operation.
7086     if (isOptimizableIVTruncate(I, VF)) {
7087       auto *Trunc = cast<TruncInst>(I);
7088       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
7089                                   Trunc->getSrcTy(), CCH, CostKind, Trunc);
7090     }
7091 
7092     // Detect reduction patterns
7093     if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7094       return *RedCost;
7095 
7096     Type *SrcScalarTy = I->getOperand(0)->getType();
7097     Type *SrcVecTy =
7098         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
7099     if (canTruncateToMinimalBitwidth(I, VF)) {
7100       // This cast is going to be shrunk. This may remove the cast or it might
7101       // turn it into a slightly different cast. For example, if MinBW == 16,
7102       // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
7103       //
7104       // Calculate the modified src and dest types.
7105       Type *MinVecTy = VectorTy;
7106       if (Opcode == Instruction::Trunc) {
7107         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
7108         VectorTy =
7109             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7110       } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
7111         // Leave SrcVecTy unchanged - we only shrink the destination element
7112         // type.
7113         VectorTy =
7114             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7115       }
7116     }
7117 
7118     return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
7119   }
7120   case Instruction::Call:
7121     return getVectorCallCost(cast<CallInst>(I), VF);
7122   case Instruction::ExtractValue:
7123     return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
7124   case Instruction::Alloca:
7125     // We cannot easily widen alloca to a scalable alloca, as
7126     // the result would need to be a vector of pointers.
7127     if (VF.isScalable())
7128       return InstructionCost::getInvalid();
7129     [[fallthrough]];
7130   default:
7131     // This opcode is unknown. Assume that it is the same as 'mul'.
7132     return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
7133   } // end of switch.
7134 }
7135 
7136 void LoopVectorizationCostModel::collectValuesToIgnore() {
7137   // Ignore ephemeral values.
7138   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
7139 
7140   // Find all stores to invariant variables. Since they are going to sink
7141   // outside the loop we do not need to calculate their cost.
7142   for (BasicBlock *BB : TheLoop->blocks())
7143     for (Instruction &I : *BB) {
7144       StoreInst *SI;
7145       if ((SI = dyn_cast<StoreInst>(&I)) &&
7146           Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
7147         ValuesToIgnore.insert(&I);
7148     }
7149 
7150   // Ignore type-promoting instructions we identified during reduction
7151   // detection.
7152   for (const auto &Reduction : Legal->getReductionVars()) {
7153     const RecurrenceDescriptor &RedDes = Reduction.second;
7154     const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
7155     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7156   }
7157   // Ignore type-casting instructions we identified during induction
7158   // detection.
7159   for (const auto &Induction : Legal->getInductionVars()) {
7160     const InductionDescriptor &IndDes = Induction.second;
7161     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7162     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7163   }
7164 }
7165 
7166 void LoopVectorizationCostModel::collectInLoopReductions() {
7167   for (const auto &Reduction : Legal->getReductionVars()) {
7168     PHINode *Phi = Reduction.first;
7169     const RecurrenceDescriptor &RdxDesc = Reduction.second;
7170 
7171     // We don't collect reductions that are type promoted (yet).
7172     if (RdxDesc.getRecurrenceType() != Phi->getType())
7173       continue;
7174 
7175     // If the target would prefer this reduction to happen "in-loop", then we
7176     // want to record it as such.
7177     unsigned Opcode = RdxDesc.getOpcode();
7178     if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
7179         !TTI.preferInLoopReduction(Opcode, Phi->getType(),
7180                                    TargetTransformInfo::ReductionFlags()))
7181       continue;
7182 
7183     // Check that we can correctly put the reductions into the loop, by
7184     // finding the chain of operations that leads from the phi to the loop
7185     // exit value.
7186     SmallVector<Instruction *, 4> ReductionOperations =
7187         RdxDesc.getReductionOpChain(Phi, TheLoop);
7188     bool InLoop = !ReductionOperations.empty();
7189 
7190     if (InLoop) {
7191       InLoopReductions.insert(Phi);
7192       // Add the elements to InLoopReductionImmediateChains for cost modelling.
7193       Instruction *LastChain = Phi;
7194       for (auto *I : ReductionOperations) {
7195         InLoopReductionImmediateChains[I] = LastChain;
7196         LastChain = I;
7197       }
7198     }
7199     LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
7200                       << " reduction for phi: " << *Phi << "\n");
7201   }
7202 }
7203 
7204 VPValue *VPBuilder::createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B,
7205                                DebugLoc DL, const Twine &Name) {
7206   assert(Pred >= CmpInst::FIRST_ICMP_PREDICATE &&
7207          Pred <= CmpInst::LAST_ICMP_PREDICATE && "invalid predicate");
7208   return tryInsertInstruction(
7209       new VPInstruction(Instruction::ICmp, Pred, A, B, DL, Name));
7210 }
7211 
7212 // This function will select a scalable VF if the target supports scalable
7213 // vectors and a fixed one otherwise.
7214 // TODO: we could return a pair of values that specify the max VF and
7215 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
7216 // `buildVPlans(VF, VF)`. We cannot do that yet because VPlan does not
7217 // have a cost model that can choose which plan to execute if more than
7218 // one is generated.
7219 static ElementCount determineVPlanVF(const TargetTransformInfo &TTI,
7220                                      LoopVectorizationCostModel &CM) {
7221   unsigned WidestType;
7222   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
7223 
7224   TargetTransformInfo::RegisterKind RegKind =
7225       TTI.enableScalableVectorization()
7226           ? TargetTransformInfo::RGK_ScalableVector
7227           : TargetTransformInfo::RGK_FixedWidthVector;
7228 
7229   TypeSize RegSize = TTI.getRegisterBitWidth(RegKind);
7230   unsigned N = RegSize.getKnownMinValue() / WidestType;
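  // E.g. 512-bit fixed-width registers with a widest element type of i32
  // give N = 16.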
7231   return ElementCount::get(N, RegSize.isScalable());
7232 }
7233 
7234 VectorizationFactor
7235 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
7236   ElementCount VF = UserVF;
7237   // Outer loop handling: outer loops may require CFG and instruction-level
7238   // transformations before even evaluating whether vectorization is profitable.
7239   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7240   // the vectorization pipeline.
7241   if (!OrigLoop->isInnermost()) {
7242     // If the user doesn't provide a vectorization factor, determine a
7243     // reasonable one.
7244     if (UserVF.isZero()) {
7245       VF = determineVPlanVF(TTI, CM);
7246       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
7247 
7248       // Make sure we have a VF > 1 for stress testing.
7249       if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
7250         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
7251                           << "overriding computed VF.\n");
7252         VF = ElementCount::getFixed(4);
7253       }
7254     } else if (UserVF.isScalable() && !TTI.supportsScalableVectors() &&
7255                !ForceTargetSupportsScalableVectors) {
7256       LLVM_DEBUG(dbgs() << "LV: Not vectorizing. Scalable VF requested, but "
7257                         << "not supported by the target.\n");
7258       reportVectorizationFailure(
7259           "Scalable vectorization requested but not supported by the target",
7260           "the scalable user-specified vectorization width for outer-loop "
7261           "vectorization cannot be used because the target does not support "
7262           "scalable vectors.",
7263           "ScalableVFUnfeasible", ORE, OrigLoop);
7264       return VectorizationFactor::Disabled();
7265     }
7266     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7267     assert(isPowerOf2_32(VF.getKnownMinValue()) &&
7268            "VF needs to be a power of two");
7269     LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
7270                       << "VF " << VF << " to build VPlans.\n");
7271     buildVPlans(VF, VF);
7272 
7273     // For VPlan build stress testing, we bail out after VPlan construction.
7274     if (VPlanBuildStressTest)
7275       return VectorizationFactor::Disabled();
7276 
7277     return {VF, 0 /*Cost*/, 0 /* ScalarCost */};
7278   }
7279 
7280   LLVM_DEBUG(
7281       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
7282                 "VPlan-native path.\n");
7283   return VectorizationFactor::Disabled();
7284 }
7285 
7286 std::optional<VectorizationFactor>
7287 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
7288   assert(OrigLoop->isInnermost() && "Inner loop expected.");
7289   CM.collectValuesToIgnore();
7290   CM.collectElementTypesForWidening();
7291 
7292   FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
7293   if (!MaxFactors) // Cases that should not be vectorized or interleaved.
7294     return std::nullopt;
7295 
7296   // Invalidate interleave groups if all blocks of loop will be predicated.
7297   if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
7298       !useMaskedInterleavedAccesses(TTI)) {
7299     LLVM_DEBUG(
7300         dbgs()
7301         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
7302            "which requires masked-interleaved support.\n");
7303     if (CM.InterleaveInfo.invalidateGroups())
7304       // Invalidating interleave groups also requires invalidating all decisions
7305       // based on them, which includes widening decisions and uniform and scalar
7306       // values.
7307       CM.invalidateCostModelingDecisions();
7308   }
7309 
7310   ElementCount MaxUserVF =
7311       UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
7312   bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF);
7313   if (!UserVF.isZero() && UserVFIsLegal) {
7314     assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
7315            "VF needs to be a power of two");
7316     // Collect the instructions (and their associated costs) that will be more
7317     // profitable to scalarize.
7318     CM.collectInLoopReductions();
7319     if (CM.selectUserVectorizationFactor(UserVF)) {
7320       LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
7321       buildVPlansWithVPRecipes(UserVF, UserVF);
7322       if (!hasPlanWithVF(UserVF)) {
7323         LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << UserVF
7324                           << ".\n");
7325         return std::nullopt;
7326       }
7327 
7328       LLVM_DEBUG(printPlans(dbgs()));
7329       return {{UserVF, 0, 0}};
7330     } else
7331       reportVectorizationInfo("UserVF ignored because of invalid costs.",
7332                               "InvalidCost", ORE, OrigLoop);
7333   }
7334 
7335   // Populate the set of Vectorization Factor Candidates.
7336   ElementCountSet VFCandidates;
7337   for (auto VF = ElementCount::getFixed(1);
7338        ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
7339     VFCandidates.insert(VF);
7340   for (auto VF = ElementCount::getScalable(1);
7341        ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
7342     VFCandidates.insert(VF);
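       // For example (illustrative values): with MaxFactors.FixedVF = 8 and
       // MaxFactors.ScalableVF = vscale x 4 the candidate set becomes
       // {1, 2, 4, 8, vscale x 1, vscale x 2, vscale x 4}.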
7343 
7344   CM.collectInLoopReductions();
7345   for (const auto &VF : VFCandidates) {
7346     // Collect Uniform and Scalar instructions after vectorization with VF.
7347     CM.collectUniformsAndScalars(VF);
7348 
7349     // Collect the instructions (and their associated costs) that will be more
7350     // profitable to scalarize.
7351     if (VF.isVector())
7352       CM.collectInstsToScalarize(VF);
7353   }
7354 
7355   buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
7356   buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
7357 
7358   LLVM_DEBUG(printPlans(dbgs()));
7359   if (!MaxFactors.hasVector())
7360     return VectorizationFactor::Disabled();
7361 
7362   // Select the optimal vectorization factor.
7363   VectorizationFactor VF = selectVectorizationFactor(VFCandidates);
7364   assert((VF.Width.isScalar() || VF.ScalarCost > 0) &&
              "when vectorizing, the scalar cost must be non-zero.");
7365   if (!hasPlanWithVF(VF.Width)) {
7366     LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << VF.Width
7367                       << ".\n");
7368     return std::nullopt;
7369   }
7370   return VF;
7371 }
7372 
7373 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
7374   assert(count_if(VPlans,
7375                   [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
7376              1 &&
7377          "Best VF does not have a single VPlan.");
7378 
7379   for (const VPlanPtr &Plan : VPlans) {
7380     if (Plan->hasVF(VF))
7381       return *Plan.get();
7382   }
7383   llvm_unreachable("No plan found!");
7384 }
7385 
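     // Roughly illustrative sketch (not taken from a specific test) of the
     // loop metadata this helper produces when no unroll-disable metadata is
     // present yet; the first operand is the required self-reference:
     //   br ..., !llvm.loop !0
     //   !0 = distinct !{!0, <existing operands...>, !1}
     //   !1 = !{!"llvm.loop.unroll.runtime.disable"}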
7386 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
7387   SmallVector<Metadata *, 4> MDs;
7388   // Reserve first location for self reference to the LoopID metadata node.
7389   MDs.push_back(nullptr);
7390   bool IsUnrollMetadata = false;
7391   MDNode *LoopID = L->getLoopID();
7392   if (LoopID) {
7393     // First find existing loop unrolling disable metadata.
7394     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
7395       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
7396       if (MD) {
7397         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
7398         IsUnrollMetadata =
7399             S && S->getString().starts_with("llvm.loop.unroll.disable");
7400       }
7401       MDs.push_back(LoopID->getOperand(i));
7402     }
7403   }
7404 
7405   if (!IsUnrollMetadata) {
7406     // Add runtime unroll disable metadata.
7407     LLVMContext &Context = L->getHeader()->getContext();
7408     SmallVector<Metadata *, 1> DisableOperands;
7409     DisableOperands.push_back(
7410         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7411     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7412     MDs.push_back(DisableNode);
7413     MDNode *NewLoopID = MDNode::get(Context, MDs);
7414     // Set operand 0 to refer to the loop id itself.
7415     NewLoopID->replaceOperandWith(0, NewLoopID);
7416     L->setLoopID(NewLoopID);
7417   }
7418 }
7419 
7420 // Check if \p RedResult is a ComputeReductionResult instruction, and if it is
7421 // create a merge phi node for it and add it to \p ReductionResumeValues.
7422 static void createAndCollectMergePhiForReduction(
7423     VPInstruction *RedResult,
7424     DenseMap<const RecurrenceDescriptor *, Value *> &ReductionResumeValues,
7425     VPTransformState &State, Loop *OrigLoop, BasicBlock *LoopMiddleBlock) {
7426   if (!RedResult ||
7427       RedResult->getOpcode() != VPInstruction::ComputeReductionResult)
7428     return;
7429 
7430   auto *PhiR = cast<VPReductionPHIRecipe>(RedResult->getOperand(0));
7431   const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
7432 
7433   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
7434   Value *FinalValue =
7435       State.get(RedResult, VPIteration(State.UF - 1, VPLane::getFirstLane()));
7436   auto *ResumePhi =
7437       dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
7438 
7439   // TODO: bc.merge.rdx should not be created here, instead it should be
7440   // modeled in VPlan.
7441   BasicBlock *LoopScalarPreHeader = OrigLoop->getLoopPreheader();
7442   // Create a phi node that merges control-flow from the backedge-taken check
7443   // block and the middle block.
7444   auto *BCBlockPhi = PHINode::Create(FinalValue->getType(), 2, "bc.merge.rdx",
7445                                      LoopScalarPreHeader->getTerminator());
7446 
7447   // If we are fixing reductions in the epilogue loop then we should already
7448   // have created a bc.merge.rdx Phi after the main vector body. Ensure that
7449   // we carry over the incoming values correctly.
7450   for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
7451     if (Incoming == LoopMiddleBlock)
7452       BCBlockPhi->addIncoming(FinalValue, Incoming);
7453     else if (ResumePhi && is_contained(ResumePhi->blocks(), Incoming))
7454       BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
7455                               Incoming);
7456     else
7457       BCBlockPhi->addIncoming(ReductionStartValue, Incoming);
7458   }
7459 
7460   auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
7461   // TODO: This fixup should instead be modeled in VPlan.
7462   // Fix the scalar loop reduction variable with the incoming reduction sum
7463   // from the vector body and from the backedge value.
7464   int IncomingEdgeBlockIdx =
7465       OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
7466   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
7467   // Pick the other block.
7468   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
7469   OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
7470   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
7471   OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
7472 
7473   ReductionResumeValues[&RdxDesc] = BCBlockPhi;
7474 }
7475 
7476 std::pair<DenseMap<const SCEV *, Value *>,
7477           DenseMap<const RecurrenceDescriptor *, Value *>>
7478 LoopVectorizationPlanner::executePlan(
7479     ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan,
7480     InnerLoopVectorizer &ILV, DominatorTree *DT, bool IsEpilogueVectorization,
7481     const DenseMap<const SCEV *, Value *> *ExpandedSCEVs) {
7482   assert(BestVPlan.hasVF(BestVF) &&
7483          "Trying to execute plan with unsupported VF");
7484   assert(BestVPlan.hasUF(BestUF) &&
7485          "Trying to execute plan with unsupported UF");
7486   assert(
7487       (IsEpilogueVectorization || !ExpandedSCEVs) &&
7488       "expanded SCEVs to reuse can only be used during epilogue vectorization");
7489 
7490   LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF
7491                     << ", UF=" << BestUF << '\n');
7492 
7493   if (!IsEpilogueVectorization)
7494     VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
7495 
7496   // Perform the actual loop transformation.
7497   VPTransformState State(BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan,
7498                          OrigLoop->getHeader()->getContext());
7499 
7500   // 0. Generate SCEV-dependent code into the preheader, including TripCount,
7501   // before making any changes to the CFG.
7502   if (!BestVPlan.getPreheader()->empty()) {
7503     State.CFG.PrevBB = OrigLoop->getLoopPreheader();
7504     State.Builder.SetInsertPoint(OrigLoop->getLoopPreheader()->getTerminator());
7505     BestVPlan.getPreheader()->execute(&State);
7506   }
7507   if (!ILV.getTripCount())
7508     ILV.setTripCount(State.get(BestVPlan.getTripCount(), {0, 0}));
7509   else
7510     assert(IsEpilogueVectorization && "should only re-use the existing trip "
7511                                       "count during epilogue vectorization");
7512 
7513   // 1. Set up the skeleton for vectorization, including vector pre-header and
7514   // middle block. The vector loop is created during VPlan execution.
7515   Value *CanonicalIVStartValue;
7516   std::tie(State.CFG.PrevBB, CanonicalIVStartValue) =
7517       ILV.createVectorizedLoopSkeleton(ExpandedSCEVs ? *ExpandedSCEVs
7518                                                      : State.ExpandedSCEVs);
7519 
7520   // Only use noalias metadata when using memory checks guaranteeing no overlap
7521   // across all iterations.
7522   const LoopAccessInfo *LAI = ILV.Legal->getLAI();
7523   std::unique_ptr<LoopVersioning> LVer = nullptr;
7524   if (LAI && !LAI->getRuntimePointerChecking()->getChecks().empty() &&
7525       !LAI->getRuntimePointerChecking()->getDiffChecks()) {
7526 
7527     //  We currently don't use LoopVersioning for the actual loop cloning but we
7528     //  still use it to add the noalias metadata.
7529     //  TODO: Find a better way to re-use LoopVersioning functionality to add
7530     //        metadata.
7531     LVer = std::make_unique<LoopVersioning>(
7532         *LAI, LAI->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, DT,
7533         PSE.getSE());
7534     State.LVer = &*LVer;
7535     State.LVer->prepareNoAliasMetadata();
7536   }
7537 
7538   ILV.collectPoisonGeneratingRecipes(State);
7539 
7540   ILV.printDebugTracesAtStart();
7541 
7542   //===------------------------------------------------===//
7543   //
7544   // Notice: any optimization or new instruction that goes
7545   // into the code below should also be implemented in
7546   // the cost-model.
7547   //
7548   //===------------------------------------------------===//
7549 
7550   // 2. Copy and widen instructions from the old loop into the new loop.
7551   BestVPlan.prepareToExecute(ILV.getTripCount(),
7552                              ILV.getOrCreateVectorTripCount(nullptr),
7553                              CanonicalIVStartValue, State);
7554 
7555   BestVPlan.execute(&State);
7556 
7557   // 2.5 Collect reduction resume values.
7558   DenseMap<const RecurrenceDescriptor *, Value *> ReductionResumeValues;
7559   auto *ExitVPBB =
7560       cast<VPBasicBlock>(BestVPlan.getVectorLoopRegion()->getSingleSuccessor());
7561   for (VPRecipeBase &R : *ExitVPBB) {
7562     createAndCollectMergePhiForReduction(dyn_cast<VPInstruction>(&R),
7563                                          ReductionResumeValues, State, OrigLoop,
7564                                          State.CFG.VPBB2IRBB[ExitVPBB]);
7565   }
7566 
7567   // 2.6. Maintain Loop Hints
7568   // Keep all loop hints from the original loop on the vector loop (we'll
7569   // replace the vectorizer-specific hints below).
7570   MDNode *OrigLoopID = OrigLoop->getLoopID();
7571 
7572   std::optional<MDNode *> VectorizedLoopID =
7573       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7574                                       LLVMLoopVectorizeFollowupVectorized});
7575 
7576   VPBasicBlock *HeaderVPBB =
7577       BestVPlan.getVectorLoopRegion()->getEntryBasicBlock();
7578   Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]);
7579   if (VectorizedLoopID)
7580     L->setLoopID(*VectorizedLoopID);
7581   else {
7582     // Keep all loop hints from the original loop on the vector loop (we'll
7583     // replace the vectorizer-specific hints below).
7584     if (MDNode *LID = OrigLoop->getLoopID())
7585       L->setLoopID(LID);
7586 
7587     LoopVectorizeHints Hints(L, true, *ORE);
7588     Hints.setAlreadyVectorized();
7589   }
7590   TargetTransformInfo::UnrollingPreferences UP;
7591   TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE);
7592   if (!UP.UnrollVectorizedLoop || CanonicalIVStartValue)
7593     AddRuntimeUnrollDisableMetaData(L);
7594 
7595   // 3. Fix the vectorized code: take care of header phi's, live-outs,
7596   //    predication, updating analyses.
7597   ILV.fixVectorizedLoop(State, BestVPlan);
7598 
7599   ILV.printDebugTracesAtEnd();
7600 
7601   return {State.ExpandedSCEVs, ReductionResumeValues};
7602 }
7603 
7604 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
7605 void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
7606   for (const auto &Plan : VPlans)
7607     if (PrintVPlansInDotFormat)
7608       Plan->printDOT(O);
7609     else
7610       Plan->print(O);
7611 }
7612 #endif
7613 
7614 //===--------------------------------------------------------------------===//
7615 // EpilogueVectorizerMainLoop
7616 //===--------------------------------------------------------------------===//
7617 
7618 /// This function is partially responsible for generating the control flow
7619 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7620 std::pair<BasicBlock *, Value *>
7621 EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton(
7622     const SCEV2ValueTy &ExpandedSCEVs) {
7623   createVectorLoopSkeleton("");
7624 
7625   // Generate the code to check the minimum iteration count of the vector
7626   // epilogue (see below).
7627   EPI.EpilogueIterationCountCheck =
7628       emitIterationCountCheck(LoopScalarPreHeader, true);
7629   EPI.EpilogueIterationCountCheck->setName("iter.check");
7630 
7631   // Generate the code to check any assumptions that we've made for SCEV
7632   // expressions.
7633   EPI.SCEVSafetyCheck = emitSCEVChecks(LoopScalarPreHeader);
7634 
7635   // Generate the code that checks at runtime if arrays overlap. We put the
7636   // checks into a separate block to make the more common case of few elements
7637   // faster.
7638   EPI.MemSafetyCheck = emitMemRuntimeChecks(LoopScalarPreHeader);
7639 
7640   // Generate the iteration count check for the main loop, *after* the check
7641   // for the epilogue loop, so that the path-length is shorter for the case
7642   // that goes directly through the vector epilogue. The longer-path length for
7643   // the main loop is compensated for by the gain from vectorizing the larger
7644   // trip count. Note: the branch will get updated later on when we vectorize
7645   // the epilogue.
7646   EPI.MainLoopIterationCountCheck =
7647       emitIterationCountCheck(LoopScalarPreHeader, false);
7648 
7649   // Generate the induction variable.
7650   EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
7651 
7652   // Skip induction resume value creation here because they will be created in
7653   // the second pass for the scalar loop. The induction resume values for the
7654   // inductions in the epilogue loop are created before executing the plan for
7655   // the epilogue loop.
7656 
7657   return {completeLoopSkeleton(), nullptr};
7658 }
7659 
7660 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
7661   LLVM_DEBUG({
7662     dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7663            << "Main Loop VF:" << EPI.MainLoopVF
7664            << ", Main Loop UF:" << EPI.MainLoopUF
7665            << ", Epilogue Loop VF:" << EPI.EpilogueVF
7666            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7667   });
7668 }
7669 
7670 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
7671   DEBUG_WITH_TYPE(VerboseDebug, {
7672     dbgs() << "intermediate fn:\n"
7673            << *OrigLoop->getHeader()->getParent() << "\n";
7674   });
7675 }
7676 
7677 BasicBlock *
7678 EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
7679                                                     bool ForEpilogue) {
7680   assert(Bypass && "Expected valid bypass basic block.");
7681   ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF;
7682   unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
7683   Value *Count = getTripCount();
7684   // Reuse existing vector loop preheader for TC checks.
7685   // Note that new preheader block is generated for vector loop.
7686   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
7687   IRBuilder<> Builder(TCCheckBlock->getTerminator());
7688 
7689   // Generate code to check if the loop's trip count is less than VF * UF of the
7690   // main vector loop.
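       // For example (assuming hypothetical factors VFactor = 4 and
       // UFactor = 2, with a fixed-width VF): the step below is 8 and the
       // check is 'icmp ult i64 %count, 8', using 'ule' instead when a scalar
       // epilogue is required so that at least one iteration is left for the
       // scalar loop.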
7691   auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF.isVector()
7692                                                     : VF.isVector())
7693                ? ICmpInst::ICMP_ULE
7694                : ICmpInst::ICMP_ULT;
7695 
7696   Value *CheckMinIters = Builder.CreateICmp(
7697       P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor),
7698       "min.iters.check");
7699 
7700   if (!ForEpilogue)
7701     TCCheckBlock->setName("vector.main.loop.iter.check");
7702 
7703   // Create new preheader for vector loop.
7704   LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
7705                                    DT, LI, nullptr, "vector.ph");
7706 
7707   if (ForEpilogue) {
7708     assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
7709                                  DT->getNode(Bypass)->getIDom()) &&
7710            "TC check is expected to dominate Bypass");
7711 
7712     // Update dominator for Bypass & LoopExit.
7713     DT->changeImmediateDominator(Bypass, TCCheckBlock);
7714     if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector()))
7715       // For loops with multiple exits, there's no edge from the middle block
7716       // to exit blocks (as the epilogue must run) and thus no need to update
7717       // the immediate dominator of the exit blocks.
7718       DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
7719 
7720     LoopBypassBlocks.push_back(TCCheckBlock);
7721 
7722     // Save the trip count so we don't have to regenerate it in the
7723     // vec.epilog.iter.check. This is safe to do because the trip count
7724     // generated here dominates the vector epilog iter check.
7725     EPI.TripCount = Count;
7726   }
7727 
7728   BranchInst &BI =
7729       *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
7730   if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
7731     setBranchWeights(BI, MinItersBypassWeights);
7732   ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
7733 
7734   return TCCheckBlock;
7735 }
7736 
7737 //===--------------------------------------------------------------------===//
7738 // EpilogueVectorizerEpilogueLoop
7739 //===--------------------------------------------------------------------===//
7740 
7741 /// This function is partially responsible for generating the control flow
7742 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7743 std::pair<BasicBlock *, Value *>
7744 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
7745     const SCEV2ValueTy &ExpandedSCEVs) {
7746   createVectorLoopSkeleton("vec.epilog.");
7747 
7748   // Now, compare the remaining count; if there aren't enough iterations to
7749   // execute the vectorized epilogue, skip to the scalar part.
7750   BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader;
7751   VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check");
7752   LoopVectorPreHeader =
7753       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
7754                  LI, nullptr, "vec.epilog.ph");
7755   emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader,
7756                                           VecEpilogueIterationCountCheck);
7757 
7758   // Adjust the control flow taking the state info from the main loop
7759   // vectorization into account.
7760   assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
7761          "expected this to be saved from the previous pass.");
7762   EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
7763       VecEpilogueIterationCountCheck, LoopVectorPreHeader);
7764 
7765   DT->changeImmediateDominator(LoopVectorPreHeader,
7766                                EPI.MainLoopIterationCountCheck);
7767 
7768   EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
7769       VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7770 
7771   if (EPI.SCEVSafetyCheck)
7772     EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
7773         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7774   if (EPI.MemSafetyCheck)
7775     EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
7776         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7777 
7778   DT->changeImmediateDominator(
7779       VecEpilogueIterationCountCheck,
7780       VecEpilogueIterationCountCheck->getSinglePredecessor());
7781 
7782   DT->changeImmediateDominator(LoopScalarPreHeader,
7783                                EPI.EpilogueIterationCountCheck);
7784   if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector()))
7785     // If there is an epilogue which must run, there's no edge from the
7786     // middle block to exit blocks and thus no need to update the immediate
7787     // dominator of the exit blocks.
7788     DT->changeImmediateDominator(LoopExitBlock,
7789                                  EPI.EpilogueIterationCountCheck);
7790 
7791   // Keep track of bypass blocks, as they feed start values to the induction and
7792   // reduction phis in the scalar loop preheader.
7793   if (EPI.SCEVSafetyCheck)
7794     LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
7795   if (EPI.MemSafetyCheck)
7796     LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
7797   LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
7798 
7799   // The vec.epilog.iter.check block may contain Phi nodes from inductions or
7800   // reductions which merge control-flow from the latch block and the middle
7801   // block. Update the incoming values here and move the Phi into the preheader.
7802   SmallVector<PHINode *, 4> PhisInBlock;
7803   for (PHINode &Phi : VecEpilogueIterationCountCheck->phis())
7804     PhisInBlock.push_back(&Phi);
7805 
7806   for (PHINode *Phi : PhisInBlock) {
7807     Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI());
7808     Phi->replaceIncomingBlockWith(
7809         VecEpilogueIterationCountCheck->getSinglePredecessor(),
7810         VecEpilogueIterationCountCheck);
7811 
7812     // If the phi doesn't have an incoming value from the
7813     // EpilogueIterationCountCheck, we are done. Otherwise remove the incoming
7814     // value and also those from other check blocks. This is needed for
7815     // reduction phis only.
7816     if (none_of(Phi->blocks(), [&](BasicBlock *IncB) {
7817           return EPI.EpilogueIterationCountCheck == IncB;
7818         }))
7819       continue;
7820     Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
7821     if (EPI.SCEVSafetyCheck)
7822       Phi->removeIncomingValue(EPI.SCEVSafetyCheck);
7823     if (EPI.MemSafetyCheck)
7824       Phi->removeIncomingValue(EPI.MemSafetyCheck);
7825   }
7826 
7827   // Generate a resume induction for the vector epilogue and put it in the
7828   // vector epilogue preheader
7829   Type *IdxTy = Legal->getWidestInductionType();
7830   PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val");
7831   EPResumeVal->insertBefore(LoopVectorPreHeader->getFirstNonPHIIt());
7832   EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
7833   EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
7834                            EPI.MainLoopIterationCountCheck);
7835 
7836   // Generate induction resume values. These variables save the new starting
7837   // indexes for the scalar loop. They are used to test if there are any tail
7838   // iterations left once the vector loop has completed.
7839   // Note that when the vectorized epilogue is skipped due to iteration count
7840   // check, then the resume value for the induction variable comes from
7841   // the trip count of the main vector loop, hence passing the AdditionalBypass
7842   // argument.
7843   createInductionResumeValues(ExpandedSCEVs,
7844                               {VecEpilogueIterationCountCheck,
7845                                EPI.VectorTripCount} /* AdditionalBypass */);
7846 
7847   return {completeLoopSkeleton(), EPResumeVal};
7848 }
7849 
7850 BasicBlock *
7851 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
7852     BasicBlock *Bypass, BasicBlock *Insert) {
7853 
7854   assert(EPI.TripCount &&
7855          "Expected trip count to have been saved in the first pass.");
7856   assert(
7857       (!isa<Instruction>(EPI.TripCount) ||
7858        DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
7859       "saved trip count does not dominate insertion point.");
7860   Value *TC = EPI.TripCount;
7861   IRBuilder<> Builder(Insert->getTerminator());
7862   Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
7863 
7864   // Generate code to check if the loop's trip count is less than VF * UF of the
7865   // vector epilogue loop.
7866   auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector())
7867                ? ICmpInst::ICMP_ULE
7868                : ICmpInst::ICMP_ULT;
7869 
7870   Value *CheckMinIters =
7871       Builder.CreateICmp(P, Count,
7872                          createStepForVF(Builder, Count->getType(),
7873                                          EPI.EpilogueVF, EPI.EpilogueUF),
7874                          "min.epilog.iters.check");
7875 
7876   BranchInst &BI =
7877       *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
7878   if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
7879     unsigned MainLoopStep = UF * VF.getKnownMinValue();
7880     unsigned EpilogueLoopStep =
7881         EPI.EpilogueUF * EPI.EpilogueVF.getKnownMinValue();
7882     // We assume the remaining `Count` is equally distributed in
7883     // [0, MainLoopStep), so the probability for `Count < EpilogueLoopStep`
7884     // should be min(MainLoopStep, EpilogueLoopStep) / MainLoopStep.
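         // For example (hypothetical factors): MainLoopStep = 8 and
         // EpilogueLoopStep = 4 give EstimatedSkipCount = 4 and weights
         // {4, 4}, i.e. an estimated 4/8 = 50% chance of skipping the vector
         // epilogue.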
7886     unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep);
7887     const uint32_t Weights[] = {EstimatedSkipCount,
7888                                 MainLoopStep - EstimatedSkipCount};
7889     setBranchWeights(BI, Weights);
7890   }
7891   ReplaceInstWithInst(Insert->getTerminator(), &BI);
7892 
7893   LoopBypassBlocks.push_back(Insert);
7894   return Insert;
7895 }
7896 
7897 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
7898   LLVM_DEBUG({
7899     dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
7900            << "Epilogue Loop VF:" << EPI.EpilogueVF
7901            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7902   });
7903 }
7904 
7905 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
7906   DEBUG_WITH_TYPE(VerboseDebug, {
7907     dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
7908   });
7909 }
7910 
7911 bool LoopVectorizationPlanner::getDecisionAndClampRange(
7912     const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
7913   assert(!Range.isEmpty() && "Trying to test an empty VF range.");
7914   bool PredicateAtRangeStart = Predicate(Range.Start);
7915 
7916   for (ElementCount TmpVF : VFRange(Range.Start * 2, Range.End))
7917     if (Predicate(TmpVF) != PredicateAtRangeStart) {
7918       Range.End = TmpVF;
7919       break;
7920     }
7921 
7922   return PredicateAtRangeStart;
7923 }
7924 
7925 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
7926 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
7927 /// of VF's starting at a given VF and extending it as much as possible. Each
7928 /// vectorization decision can potentially shorten this sub-range during
7929 /// buildVPlan().
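     /// For example (an illustrative sketch, not a fixed policy): with
     /// \p MinVF = 4 and \p MaxVF = 16 the loop below starts from the range
     /// [4, 32); if the first buildVPlan() call clamps that range to [4, 8),
     /// the resulting plan covers only VF 4 and the next iteration continues
     /// from VF 8, until the whole [4, 32) range has been covered.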
7930 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
7931                                            ElementCount MaxVF) {
7932   auto MaxVFTimes2 = MaxVF * 2;
7933   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
7934     VFRange SubRange = {VF, MaxVFTimes2};
7935     VPlans.push_back(buildVPlan(SubRange));
7936     VF = SubRange.End;
7937   }
7938 }
7939 
7940 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
7941                                          VPlan &Plan) {
7942   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
7943 
7944   // Look for cached value.
7945   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
7946   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
7947   if (ECEntryIt != EdgeMaskCache.end())
7948     return ECEntryIt->second;
7949 
7950   VPValue *SrcMask = getBlockInMask(Src);
7951 
7952   // The terminator has to be a branch inst!
7953   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
7954   assert(BI && "Unexpected terminator found");
7955 
7956   if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
7957     return EdgeMaskCache[Edge] = SrcMask;
7958 
7959   // If source is an exiting block, we know the exit edge is dynamically dead
7960   // in the vector loop, and thus we don't need to restrict the mask.  Avoid
7961   // adding uses of an otherwise potentially dead instruction.
7962   if (OrigLoop->isLoopExiting(Src))
7963     return EdgeMaskCache[Edge] = SrcMask;
7964 
7965   VPValue *EdgeMask = Plan.getVPValueOrAddLiveIn(BI->getCondition());
7966   assert(EdgeMask && "No Edge Mask found for condition");
7967 
7968   if (BI->getSuccessor(0) != Dst)
7969     EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc());
7970 
7971   if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
7972     // The condition is 'SrcMask && EdgeMask', which is equivalent to
7973     // 'select i1 SrcMask, i1 EdgeMask, i1 false'.
7974     // The select version does not introduce new UB if SrcMask is false and
7975     // EdgeMask is poison. Using 'and' here introduces undefined behavior.
7976     VPValue *False = Plan.getVPValueOrAddLiveIn(
7977         ConstantInt::getFalse(BI->getCondition()->getType()));
7978     EdgeMask =
7979         Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc());
7980   }
7981 
7982   return EdgeMaskCache[Edge] = EdgeMask;
7983 }
7984 
7985 void VPRecipeBuilder::createHeaderMask(VPlan &Plan) {
7986   BasicBlock *Header = OrigLoop->getHeader();
7987 
7988   // When not folding the tail, use nullptr to model all-true mask.
7989   if (!CM.foldTailByMasking()) {
7990     BlockMaskCache[Header] = nullptr;
7991     return;
7992   }
7993 
7994   // Introduce the early-exit compare IV <= BTC to form header block mask.
7995   // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
7996   // constructing the desired canonical IV in the header block as its first
7997   // non-phi instructions.
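       // As an illustrative example (hypothetical values): for a trip count
       // TC of 10 and VF 4, BTC is 9; in the last vector iteration the widened
       // IV lanes are <8, 9, 10, 11> and the mask 'icmp ule <8, 9, 10, 11>, 9'
       // disables the two lanes past the end. Comparing IV < TC would behave
       // the same here, but TC can wrap to 0 (e.g. when the scalar loop runs
       // 2^N times for an N-bit counter), whereas BTC = 2^N - 1 stays
       // representable.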
7998 
7999   VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
8000   auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
8001   auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV());
8002   HeaderVPBB->insert(IV, NewInsertionPoint);
8003 
8004   VPBuilder::InsertPointGuard Guard(Builder);
8005   Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
8006   VPValue *BlockMask = nullptr;
8007   VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
8008   BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC);
8009   BlockMaskCache[Header] = BlockMask;
8010 }
8011 
8012 VPValue *VPRecipeBuilder::getBlockInMask(BasicBlock *BB) const {
8013   // Return the cached value.
8014   BlockMaskCacheTy::const_iterator BCEntryIt = BlockMaskCache.find(BB);
8015   assert(BCEntryIt != BlockMaskCache.end() &&
8016          "Trying to access mask for block without one.");
8017   return BCEntryIt->second;
8018 }
8019 
8020 void VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) {
8021   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
8022   assert(BlockMaskCache.count(BB) == 0 && "Mask for block already computed");
8023   assert(OrigLoop->getHeader() != BB &&
8024          "Loop header must have cached block mask");
8025 
8026   // All-one mask is modelled as no-mask following the convention for masked
8027   // load/store/gather/scatter. Initialize BlockMask to no-mask.
8028   VPValue *BlockMask = nullptr;
8029   // This is the block mask. We OR all incoming edges.
8030   for (auto *Predecessor : predecessors(BB)) {
8031     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
8032     if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is too.
8033       BlockMaskCache[BB] = EdgeMask;
           return;
8034     }
8035 
8036     if (!BlockMask) { // BlockMask has its initialized nullptr value.
8037       BlockMask = EdgeMask;
8038       continue;
8039     }
8040 
8041     BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
8042   }
8043 
8044   BlockMaskCache[BB] = BlockMask;
8045 }
8046 
8047 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
8048                                                 ArrayRef<VPValue *> Operands,
8049                                                 VFRange &Range,
8050                                                 VPlanPtr &Plan) {
8051   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
8052          "Must be called with either a load or store");
8053 
8054   auto willWiden = [&](ElementCount VF) -> bool {
8055     LoopVectorizationCostModel::InstWidening Decision =
8056         CM.getWideningDecision(I, VF);
8057     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
8058            "CM decision should be taken at this point.");
8059     if (Decision == LoopVectorizationCostModel::CM_Interleave)
8060       return true;
8061     if (CM.isScalarAfterVectorization(I, VF) ||
8062         CM.isProfitableToScalarize(I, VF))
8063       return false;
8064     return Decision != LoopVectorizationCostModel::CM_Scalarize;
8065   };
8066 
8067   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8068     return nullptr;
8069 
8070   VPValue *Mask = nullptr;
8071   if (Legal->isMaskRequired(I))
8072     Mask = getBlockInMask(I->getParent());
8073 
8074   // Determine if the pointer operand of the access is either consecutive or
8075   // reverse consecutive.
8076   LoopVectorizationCostModel::InstWidening Decision =
8077       CM.getWideningDecision(I, Range.Start);
8078   bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
8079   bool Consecutive =
8080       Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
8081 
8082   VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
8083   if (Consecutive) {
8084     auto *GEP = dyn_cast<GetElementPtrInst>(
8085         Ptr->getUnderlyingValue()->stripPointerCasts());
8086     auto *VectorPtr = new VPVectorPointerRecipe(
8087         Ptr, getLoadStoreType(I), Reverse, GEP ? GEP->isInBounds() : false,
8088         I->getDebugLoc());
8089     Builder.getInsertBlock()->appendRecipe(VectorPtr);
8090     Ptr = VectorPtr;
8091   }
8092   if (LoadInst *Load = dyn_cast<LoadInst>(I))
8093     return new VPWidenMemoryInstructionRecipe(*Load, Ptr, Mask, Consecutive,
8094                                               Reverse);
8095 
8096   StoreInst *Store = cast<StoreInst>(I);
8097   return new VPWidenMemoryInstructionRecipe(*Store, Ptr, Operands[0], Mask,
8098                                             Consecutive, Reverse);
8099 }
8100 
8101 /// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will also
8102 /// insert a recipe to expand the step for the induction recipe.
8103 static VPWidenIntOrFpInductionRecipe *
8104 createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc,
8105                             VPValue *Start, const InductionDescriptor &IndDesc,
8106                             VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop,
8107                             VFRange &Range) {
8108   assert(IndDesc.getStartValue() ==
8109          Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
8110   assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) &&
8111          "step must be loop invariant");
8112 
8113   VPValue *Step =
8114       vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE);
8115   if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) {
8116     return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI);
8117   }
8118   assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
8119   return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc);
8120 }
8121 
8122 VPRecipeBase *VPRecipeBuilder::tryToOptimizeInductionPHI(
8123     PHINode *Phi, ArrayRef<VPValue *> Operands, VPlan &Plan, VFRange &Range) {
8124 
8125   // Check if this is an integer or fp induction. If so, build the recipe that
8126   // produces its scalar and vector values.
8127   if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
8128     return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, Plan,
8129                                        *PSE.getSE(), *OrigLoop, Range);
8130 
8131   // Check if this is pointer induction. If so, build the recipe for it.
8132   if (auto *II = Legal->getPointerInductionDescriptor(Phi)) {
8133     VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep(),
8134                                                            *PSE.getSE());
8135     return new VPWidenPointerInductionRecipe(
8136         Phi, Operands[0], Step, *II,
8137         LoopVectorizationPlanner::getDecisionAndClampRange(
8138             [&](ElementCount VF) {
8139               return CM.isScalarAfterVectorization(Phi, VF);
8140             },
8141             Range));
8142   }
8143   return nullptr;
8144 }
8145 
8146 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
8147     TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, VPlan &Plan) {
8148   // Optimize the special case where the source is a constant integer
8149   // induction variable. Notice that we can only optimize the 'trunc' case
8150   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
8151   // (c) other casts depend on pointer size.
8152 
8153   // Determine whether \p K is a truncation based on an induction variable that
8154   // can be optimized.
8155   auto isOptimizableIVTruncate =
8156       [&](Instruction *K) -> std::function<bool(ElementCount)> {
8157     return [=](ElementCount VF) -> bool {
8158       return CM.isOptimizableIVTruncate(K, VF);
8159     };
8160   };
8161 
8162   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8163           isOptimizableIVTruncate(I), Range)) {
8164 
8165     auto *Phi = cast<PHINode>(I->getOperand(0));
8166     const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi);
8167     VPValue *Start = Plan.getVPValueOrAddLiveIn(II.getStartValue());
8168     return createWidenInductionRecipes(Phi, I, Start, II, Plan, *PSE.getSE(),
8169                                        *OrigLoop, Range);
8170   }
8171   return nullptr;
8172 }
8173 
8174 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi,
8175                                                 ArrayRef<VPValue *> Operands,
8176                                                 VPlanPtr &Plan) {
8177   // If all incoming values are equal, the incoming VPValue can be used directly
8178   // instead of creating a new VPBlendRecipe.
8179   if (llvm::all_equal(Operands))
8180     return Operands[0];
8181 
8182   unsigned NumIncoming = Phi->getNumIncomingValues();
8183   // For in-loop reductions, we do not need to create an additional select.
8184   VPValue *InLoopVal = nullptr;
8185   for (unsigned In = 0; In < NumIncoming; In++) {
8186     PHINode *PhiOp =
8187         dyn_cast_or_null<PHINode>(Operands[In]->getUnderlyingValue());
8188     if (PhiOp && CM.isInLoopReduction(PhiOp)) {
8189       assert(!InLoopVal && "Found more than one in-loop reduction!");
8190       InLoopVal = Operands[In];
8191     }
8192   }
8193 
8194   assert((!InLoopVal || NumIncoming == 2) &&
8195          "Found an in-loop reduction for PHI with unexpected number of "
8196          "incoming values");
8197   if (InLoopVal)
8198     return Operands[Operands[0] == InLoopVal ? 1 : 0];
8199 
8200   // We know that all PHIs in non-header blocks are converted into selects, so
8201   // we don't have to worry about the insertion order and we can just use the
8202   // builder. At this point we generate the predication tree. There may be
8203   // duplications since this is a simple recursive scan, but future
8204   // optimizations will clean it up.
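       // A minimal illustration (hypothetical IR): for 'phi [ %a, %bb1 ],
       // [ %b, %bb2 ]' the blend recipe built below carries the operands
       // {%a, mask(bb1 -> phi block), %b, mask(bb2 -> phi block)}, which is
       // later lowered to a select (or chain of selects) over the incoming
       // values.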
8205   SmallVector<VPValue *, 2> OperandsWithMask;
8206 
8207   for (unsigned In = 0; In < NumIncoming; In++) {
8208     VPValue *EdgeMask =
8209         createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), *Plan);
8210     assert((EdgeMask || NumIncoming == 1) &&
8211            "Multiple predecessors with one having a full mask");
8212     OperandsWithMask.push_back(Operands[In]);
8213     if (EdgeMask)
8214       OperandsWithMask.push_back(EdgeMask);
8215   }
8216   return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask));
8217 }
8218 
8219 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
8220                                                    ArrayRef<VPValue *> Operands,
8221                                                    VFRange &Range,
8222                                                    VPlanPtr &Plan) {
8223   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8224       [this, CI](ElementCount VF) {
8225         return CM.isScalarWithPredication(CI, VF);
8226       },
8227       Range);
8228 
8229   if (IsPredicated)
8230     return nullptr;
8231 
8232   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8233   if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8234              ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8235              ID == Intrinsic::pseudoprobe ||
8236              ID == Intrinsic::experimental_noalias_scope_decl))
8237     return nullptr;
8238 
8239   SmallVector<VPValue *, 4> Ops(Operands.take_front(CI->arg_size()));
8240 
8241   // Is it beneficial to perform intrinsic call compared to lib call?
8242   bool ShouldUseVectorIntrinsic =
8243       ID && LoopVectorizationPlanner::getDecisionAndClampRange(
8244                 [&](ElementCount VF) -> bool {
8245                   return CM.getCallWideningDecision(CI, VF).Kind ==
8246                          LoopVectorizationCostModel::CM_IntrinsicCall;
8247                 },
8248                 Range);
8249   if (ShouldUseVectorIntrinsic)
8250     return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()), ID);
8251 
8252   Function *Variant = nullptr;
8253   std::optional<unsigned> MaskPos;
8254   // Is it better to call a vectorized version of the function than to
8255   // scalarize the call?
8256   auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange(
8257       [&](ElementCount VF) -> bool {
8258         // The following case may be scalarized depending on the VF.
8259         // The flag shows whether we can use a usual Call for vectorized
8260         // version of the instruction.
8261 
8262         // If we've found a variant at a previous VF, then stop looking. A
8263         // vectorized variant of a function expects input in a certain shape
8264         // -- basically the number of input registers, the number of lanes
8265         // per register, and whether there's a mask required.
8266         // We store a pointer to the variant in the VPWidenCallRecipe, so
8267         // once we have an appropriate variant it's only valid for that VF.
8268         // This will force a different vplan to be generated for each VF that
8269         // finds a valid variant.
8270         if (Variant)
8271           return false;
8272         LoopVectorizationCostModel::CallWideningDecision Decision =
8273             CM.getCallWideningDecision(CI, VF);
8274         if (Decision.Kind == LoopVectorizationCostModel::CM_VectorCall) {
8275           Variant = Decision.Variant;
8276           MaskPos = Decision.MaskPos;
8277           return true;
8278         }
8279 
8280         return false;
8281       },
8282       Range);
8283   if (ShouldUseVectorCall) {
8284     if (MaskPos.has_value()) {
8285       // We have 2 cases that would require a mask:
8286       //   1) The block needs to be predicated, either due to a conditional
8287       //      in the scalar loop or use of an active lane mask with
8288       //      tail-folding, and we use the appropriate mask for the block.
8289       //   2) No mask is required for the block, but the only available
8290       //      vector variant at this VF requires a mask, so we synthesize an
8291       //      all-true mask.
8292       VPValue *Mask = nullptr;
8293       if (Legal->isMaskRequired(CI))
8294         Mask = getBlockInMask(CI->getParent());
8295       else
8296         Mask = Plan->getVPValueOrAddLiveIn(ConstantInt::getTrue(
8297             IntegerType::getInt1Ty(Variant->getFunctionType()->getContext())));
8298 
8299       Ops.insert(Ops.begin() + *MaskPos, Mask);
8300     }
8301 
8302     return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()),
8303                                  Intrinsic::not_intrinsic, Variant);
8304   }
8305 
8306   return nullptr;
8307 }
8308 
8309 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
8310   assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8311          !isa<StoreInst>(I) && "Instruction should have been handled earlier");
8312   // Instruction should be widened, unless it is scalar after vectorization,
8313   // scalarization is profitable or it is predicated.
8314   auto WillScalarize = [this, I](ElementCount VF) -> bool {
8315     return CM.isScalarAfterVectorization(I, VF) ||
8316            CM.isProfitableToScalarize(I, VF) ||
8317            CM.isScalarWithPredication(I, VF);
8318   };
8319   return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
8320                                                              Range);
8321 }
8322 
8323 VPRecipeBase *VPRecipeBuilder::tryToWiden(Instruction *I,
8324                                           ArrayRef<VPValue *> Operands,
8325                                           VPBasicBlock *VPBB, VPlanPtr &Plan) {
8326   switch (I->getOpcode()) {
8327   default:
8328     return nullptr;
8329   case Instruction::SDiv:
8330   case Instruction::UDiv:
8331   case Instruction::SRem:
8332   case Instruction::URem: {
8333     // If not provably safe, use a select to form a safe divisor before widening the
8334     // div/rem operation itself.  Otherwise fall through to general handling below.
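         // A minimal sketch (hypothetical IR) of the select-based safe divisor
         // for a predicated udiv under block mask %m:
         //   %safe.rhs = select i1 %m, i32 %rhs, i32 1
         //   %div      = udiv i32 %lhs, %safe.rhs
         // Lanes that are masked off divide by 1 instead of a potentially
         // zero or poison divisor.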
8335     if (CM.isPredicatedInst(I)) {
8336       SmallVector<VPValue *> Ops(Operands.begin(), Operands.end());
8337       VPValue *Mask = getBlockInMask(I->getParent());
8338       VPValue *One = Plan->getVPValueOrAddLiveIn(
8339           ConstantInt::get(I->getType(), 1u, false));
8340       auto *SafeRHS =
8341          new VPInstruction(Instruction::Select, {Mask, Ops[1], One},
8342                            I->getDebugLoc());
8343       VPBB->appendRecipe(SafeRHS);
8344       Ops[1] = SafeRHS;
8345       return new VPWidenRecipe(*I, make_range(Ops.begin(), Ops.end()));
8346     }
8347     [[fallthrough]];
8348   }
8349   case Instruction::Add:
8350   case Instruction::And:
8351   case Instruction::AShr:
8352   case Instruction::FAdd:
8353   case Instruction::FCmp:
8354   case Instruction::FDiv:
8355   case Instruction::FMul:
8356   case Instruction::FNeg:
8357   case Instruction::FRem:
8358   case Instruction::FSub:
8359   case Instruction::ICmp:
8360   case Instruction::LShr:
8361   case Instruction::Mul:
8362   case Instruction::Or:
8363   case Instruction::Select:
8364   case Instruction::Shl:
8365   case Instruction::Sub:
8366   case Instruction::Xor:
8367   case Instruction::Freeze:
8368     return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end()));
8369   }
     }
8370 }
8371 
8372 void VPRecipeBuilder::fixHeaderPhis() {
8373   BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
8374   for (VPHeaderPHIRecipe *R : PhisToFix) {
8375     auto *PN = cast<PHINode>(R->getUnderlyingValue());
8376     VPRecipeBase *IncR =
8377         getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
8378     R->addOperand(IncR->getVPSingleValue());
8379   }
8380 }
8381 
8382 VPRecipeOrVPValueTy VPRecipeBuilder::handleReplication(Instruction *I,
8383                                                        VFRange &Range,
8384                                                        VPlan &Plan) {
8385   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
8386       [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8387       Range);
8388 
8389   bool IsPredicated = CM.isPredicatedInst(I);
8390 
8391   // Even if the instruction is not marked as uniform, there are certain
8392   // intrinsic calls that can be effectively treated as such, so we check for
8393   // them here. Conservatively, we only do this for scalable vectors, since
8394   // for fixed-width VFs we can always fall back on full scalarization.
8395   if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
8396     switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
8397     case Intrinsic::assume:
8398     case Intrinsic::lifetime_start:
8399     case Intrinsic::lifetime_end:
8400       // For scalable vectors if one of the operands is variant then we still
8401       // want to mark as uniform, which will generate one instruction for just
8402       // the first lane of the vector. We can't scalarize the call in the same
8403       // way as for fixed-width vectors because we don't know how many lanes
8404       // there are.
8405       //
8406       // The reasons for doing it this way for scalable vectors are:
8407       //   1. For the assume intrinsic generating the instruction for the first
8408       //      lane is still better than not generating any at all. For
8409       //      example, the input may be a splat across all lanes.
8410       //   2. For the lifetime start/end intrinsics the pointer operand only
8411       //      does anything useful when the input comes from a stack object,
8412       //      which suggests it should always be uniform. For non-stack objects
8413       //      the effect is to poison the object, which still allows us to
8414       //      remove the call.
8415       IsUniform = true;
8416       break;
8417     default:
8418       break;
8419     }
8420   }
8421   VPValue *BlockInMask = nullptr;
8422   if (!IsPredicated) {
8423     // Finalize the recipe for Instr, first if it is not predicated.
8424     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8425   } else {
8426     LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8427     // Instructions marked for predication are replicated and a mask operand is
8428     // added initially. Masked replicate recipes will later be placed under an
8429     // if-then construct to prevent side-effects. Generate recipes to compute
8430     // the block mask for this region.
8431     BlockInMask = getBlockInMask(I->getParent());
8432   }
8433 
8434   auto *Recipe = new VPReplicateRecipe(I, Plan.mapToVPValues(I->operands()),
8435                                        IsUniform, BlockInMask);
8436   return toVPRecipeResult(Recipe);
8437 }
8438 
8439 VPRecipeOrVPValueTy
8440 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8441                                         ArrayRef<VPValue *> Operands,
8442                                         VFRange &Range, VPBasicBlock *VPBB,
8443                                         VPlanPtr &Plan) {
8444   // First, check for specific widening recipes that deal with inductions, Phi
8445   // nodes, calls and memory operations.
8446   VPRecipeBase *Recipe;
8447   if (auto Phi = dyn_cast<PHINode>(Instr)) {
8448     if (Phi->getParent() != OrigLoop->getHeader())
8449       return tryToBlend(Phi, Operands, Plan);
8450 
8451     // Always record recipes for header phis. Later first-order recurrence phis
8452     // can have earlier phis as incoming values.
8453     recordRecipeOf(Phi);
8454 
8455     if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, *Plan, Range)))
8456       return toVPRecipeResult(Recipe);
8457 
8458     VPHeaderPHIRecipe *PhiRecipe = nullptr;
8459     assert((Legal->isReductionVariable(Phi) ||
8460             Legal->isFixedOrderRecurrence(Phi)) &&
8461            "can only widen reductions and fixed-order recurrences here");
8462     VPValue *StartV = Operands[0];
8463     if (Legal->isReductionVariable(Phi)) {
8464       const RecurrenceDescriptor &RdxDesc =
8465           Legal->getReductionVars().find(Phi)->second;
8466       assert(RdxDesc.getRecurrenceStartValue() ==
8467              Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8468       PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
8469                                            CM.isInLoopReduction(Phi),
8470                                            CM.useOrderedReductions(RdxDesc));
8471     } else {
8472       // TODO: Currently fixed-order recurrences are modeled as chains of
8473       // first-order recurrences. If there are no users of the intermediate
8474       // recurrences in the chain, the fixed order recurrence should be modeled
8475       // directly, enabling more efficient codegen.
8476       PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
8477     }
8478 
8479     // Record the incoming value from the backedge, so we can add it to the phi
8480     // recipe after all recipes have been created.
8481     auto *Inc = cast<Instruction>(
8482         Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
8483     auto RecipeIter = Ingredient2Recipe.find(Inc);
8484     if (RecipeIter == Ingredient2Recipe.end())
8485       recordRecipeOf(Inc);
8486 
8487     PhisToFix.push_back(PhiRecipe);
8488     return toVPRecipeResult(PhiRecipe);
8489   }
8490 
8491   if (isa<TruncInst>(Instr) &&
8492       (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands,
8493                                                Range, *Plan)))
8494     return toVPRecipeResult(Recipe);
8495 
8496   // All widen recipes below deal only with VF > 1.
8497   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8498           [&](ElementCount VF) { return VF.isScalar(); }, Range))
8499     return nullptr;
8500 
8501   if (auto *CI = dyn_cast<CallInst>(Instr))
8502     return toVPRecipeResult(tryToWidenCall(CI, Operands, Range, Plan));
8503 
8504   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8505     return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));
8506 
8507   if (!shouldWiden(Instr, Range))
8508     return nullptr;
8509 
8510   if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
8511     return toVPRecipeResult(new VPWidenGEPRecipe(
8512         GEP, make_range(Operands.begin(), Operands.end())));
8513 
8514   if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8515     return toVPRecipeResult(new VPWidenSelectRecipe(
8516         *SI, make_range(Operands.begin(), Operands.end())));
8517   }
8518 
8519   if (auto *CI = dyn_cast<CastInst>(Instr)) {
8520     return toVPRecipeResult(new VPWidenCastRecipe(CI->getOpcode(), Operands[0],
8521                                                   CI->getType(), *CI));
8522   }
8523 
8524   return toVPRecipeResult(tryToWiden(Instr, Operands, VPBB, Plan));
8525 }
8526 
8527 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8528                                                         ElementCount MaxVF) {
8529   assert(OrigLoop->isInnermost() && "Inner loop expected.");
8530 
8531   auto MaxVFTimes2 = MaxVF * 2;
8532   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
8533     VFRange SubRange = {VF, MaxVFTimes2};
8534     if (auto Plan = tryToBuildVPlanWithVPRecipes(SubRange)) {
8535       // Now optimize the initial VPlan.
8536       if (!Plan->hasVF(ElementCount::getFixed(1)))
8537         VPlanTransforms::truncateToMinimalBitwidths(
8538             *Plan, CM.getMinimalBitwidths(), PSE.getSE()->getContext());
8539       VPlanTransforms::optimize(*Plan, *PSE.getSE());
8540       assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid");
8541       VPlans.push_back(std::move(Plan));
8542     }
8543     VF = SubRange.End;
8544   }
8545 }
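     // An illustrative example for buildVPlansWithVPRecipes above, with
     // hypothetical values: for MinVF = 4 and MaxVF = 16 the initial range is
     // [4, 32). tryToBuildVPlanWithVPRecipes clamps SubRange.End whenever a
     // per-VF decision changes within the range, so one VPlan may end up
     // covering, say, [4, 8) and the next one [8, 32).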
8546 
8547 // Add the necessary canonical IV and branch recipes required to control the
8548 // loop.
8549 static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
8550                                   DebugLoc DL) {
8551   Value *StartIdx = ConstantInt::get(IdxTy, 0);
8552   auto *StartV = Plan.getVPValueOrAddLiveIn(StartIdx);
8553 
8554   // Add a VPCanonicalIVPHIRecipe starting at 0 to the header.
8555   auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
8556   VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
8557   VPBasicBlock *Header = TopRegion->getEntryBasicBlock();
8558   Header->insert(CanonicalIVPHI, Header->begin());
8559 
8560   // Add a CanonicalIVIncrement{NUW} VPInstruction to increment the scalar
8561   // IV by VF * UF.
8562   auto *CanonicalIVIncrement =
8563       new VPInstruction(Instruction::Add, {CanonicalIVPHI, &Plan.getVFxUF()},
8564                         {HasNUW, false}, DL, "index.next");
8565   CanonicalIVPHI->addOperand(CanonicalIVIncrement);
8566 
8567   VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
8568   EB->appendRecipe(CanonicalIVIncrement);
8569 
8570   // Add the BranchOnCount VPInstruction to the latch.
8571   VPInstruction *BranchBack =
8572       new VPInstruction(VPInstruction::BranchOnCount,
8573                         {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
8574   EB->appendRecipe(BranchBack);
8575 }
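     // Schematically, and only as an illustrative sketch, the recipes added by
     // addCanonicalIVRecipes correspond to IR of the form:
     //   vector.body:
     //     %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
     //     ...
     //     %index.next = add i64 %index, (VF * UF)  ; nuw when not tail-folding
     //     %done = icmp eq i64 %index.next, %n.vec
     //     br i1 %done, label %middle.block, label %vector.body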
8576 
8577 // Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the
8578 // original exit block.
8579 static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB, Loop *OrigLoop,
8580                                 VPlan &Plan) {
8581   BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock();
8582   BasicBlock *ExitingBB = OrigLoop->getExitingBlock();
8583   // Only handle single-exit loops with unique exit blocks for now.
8584   if (!ExitBB || !ExitBB->getSinglePredecessor() || !ExitingBB)
8585     return;
8586 
8587   // Introduce VPUsers modeling the exit values.
8588   for (PHINode &ExitPhi : ExitBB->phis()) {
8589     Value *IncomingValue =
8590         ExitPhi.getIncomingValueForBlock(ExitingBB);
8591     VPValue *V = Plan.getVPValueOrAddLiveIn(IncomingValue);
8592     Plan.addLiveOut(&ExitPhi, V);
8593   }
8594 }
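     // An illustrative (hypothetical) example for addUsersInExitBlock: for an
     // LCSSA phi in the exit block such as
     //   %res.lcssa = phi i32 [ %res, %loop ]
     // a VPLiveOut is added for %res.lcssa, keyed on the VPValue modeling the
     // incoming value %res from the exiting block.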
8595 
8596 VPlanPtr
8597 LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
8598 
8599   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
8600 
8601   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
8602 
8603   // ---------------------------------------------------------------------------
8604   // Pre-construction: record ingredients whose recipes we'll need to further
8605   // process after constructing the initial VPlan.
8606   // ---------------------------------------------------------------------------
8607 
8608   // For each interleave group which is relevant for this (possibly trimmed)
8609   // Range, add it to the set of groups to be later applied to the VPlan and add
8610   // placeholders for its members' Recipes which we'll be replacing with a
8611   // single VPInterleaveRecipe.
8612   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
8613     auto applyIG = [IG, this](ElementCount VF) -> bool {
8614       bool Result = (VF.isVector() && // Query is illegal for VF == 1
8615                      CM.getWideningDecision(IG->getInsertPos(), VF) ==
8616                          LoopVectorizationCostModel::CM_Interleave);
8617       // For scalable vectors, the only interleave factor currently supported
8618       // is 2 since we require the (de)interleave2 intrinsics instead of
8619       // shufflevectors.
8620       assert((!Result || !VF.isScalable() || IG->getFactor() == 2) &&
8621              "Unsupported interleave factor for scalable vectors");
8622       return Result;
8623     };
8624     if (!getDecisionAndClampRange(applyIG, Range))
8625       continue;
8626     InterleaveGroups.insert(IG);
8627     for (unsigned i = 0; i < IG->getFactor(); i++)
8628       if (Instruction *Member = IG->getMember(i))
8629         RecipeBuilder.recordRecipeOf(Member);
8630   }
8631 
8632   // ---------------------------------------------------------------------------
8633   // Build initial VPlan: Scan the body of the loop in a topological order to
8634   // visit each basic block after having visited its predecessor basic blocks.
8635   // ---------------------------------------------------------------------------
8636 
8637   // Create initial VPlan skeleton, having a basic block for the pre-header
8638   // which contains SCEV expansions that need to happen before the CFG is
8639   // modified; a basic block for the vector pre-header, followed by a region for
8640   // the vector loop, followed by the middle basic block. The skeleton vector
8641   // loop region contains a header and latch basic blocks.
8642   VPlanPtr Plan = VPlan::createInitialVPlan(
8643       createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop),
8644       *PSE.getSE());
8645   VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body");
8646   VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch");
8647   VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB);
8648   Plan->getVectorLoopRegion()->setEntry(HeaderVPBB);
8649   Plan->getVectorLoopRegion()->setExiting(LatchVPBB);
8650 
8651   // Don't use getDecisionAndClampRange here, because we don't know the UF,
8652   // so it is better to be conservative rather than to split this up into
8653   // different VPlans.
8654   // TODO: Consider using getDecisionAndClampRange here to split up VPlans.
8655   bool IVUpdateMayOverflow = false;
8656   for (ElementCount VF : Range)
8657     IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF);
8658 
8659   DebugLoc DL = getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
8660   TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow);
8661   // When not folding the tail, we know that the induction increment will not
8662   // overflow.
8663   bool HasNUW = Style == TailFoldingStyle::None;
8664   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL);
8665 
8666   // Scan the body of the loop in a topological order to visit each basic block
8667   // after having visited its predecessor basic blocks.
8668   LoopBlocksDFS DFS(OrigLoop);
8669   DFS.perform(LI);
8670 
8671   VPBasicBlock *VPBB = HeaderVPBB;
8672   bool NeedsMasks = CM.foldTailByMasking() ||
8673                     any_of(OrigLoop->blocks(), [this](BasicBlock *BB) {
8674                       return Legal->blockNeedsPredication(BB);
8675                     });
8676   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
8677     // Relevant instructions from basic block BB will be grouped into VPRecipe
8678     // ingredients and fill a new VPBasicBlock.
8679     if (VPBB != HeaderVPBB)
8680       VPBB->setName(BB->getName());
8681     Builder.setInsertPoint(VPBB);
8682 
8683     if (VPBB == HeaderVPBB)
8684       RecipeBuilder.createHeaderMask(*Plan);
8685     else if (NeedsMasks)
8686       RecipeBuilder.createBlockInMask(BB, *Plan);
8687 
8688     // Introduce each ingredient into VPlan.
8689     // TODO: Model and preserve debug intrinsics in VPlan.
8690     for (Instruction &I : drop_end(BB->instructionsWithoutDebug(false))) {
8691       Instruction *Instr = &I;
8692       SmallVector<VPValue *, 4> Operands;
8693       auto *Phi = dyn_cast<PHINode>(Instr);
8694       if (Phi && Phi->getParent() == OrigLoop->getHeader()) {
8695         Operands.push_back(Plan->getVPValueOrAddLiveIn(
8696             Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
8697       } else {
8698         auto OpRange = Plan->mapToVPValues(Instr->operands());
8699         Operands = {OpRange.begin(), OpRange.end()};
8700       }
8701 
8702       // Invariant stores inside the loop will be deleted, and a single store
8703       // with the final reduction value will be added to the exit block.
8704       StoreInst *SI;
8705       if ((SI = dyn_cast<StoreInst>(&I)) &&
8706           Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
8707         continue;
8708 
8709       auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe(
8710           Instr, Operands, Range, VPBB, Plan);
8711       if (!RecipeOrValue)
8712         RecipeOrValue = RecipeBuilder.handleReplication(Instr, Range, *Plan);
8713       // If Instr can be simplified to an existing VPValue, use it.
8714       if (isa<VPValue *>(RecipeOrValue)) {
8715         auto *VPV = cast<VPValue *>(RecipeOrValue);
8716         Plan->addVPValue(Instr, VPV);
8717         // If the re-used value is a recipe, register the recipe for the
8718         // instruction, in case the recipe for Instr needs to be recorded.
8719         if (VPRecipeBase *R = VPV->getDefiningRecipe())
8720           RecipeBuilder.setRecipe(Instr, R);
8721         continue;
8722       }
8723       // Otherwise, add the new recipe.
8724       VPRecipeBase *Recipe = cast<VPRecipeBase *>(RecipeOrValue);
8725       for (auto *Def : Recipe->definedValues()) {
8726         auto *UV = Def->getUnderlyingValue();
8727         Plan->addVPValue(UV, Def);
8728       }
8729 
8730       RecipeBuilder.setRecipe(Instr, Recipe);
8731       if (isa<VPHeaderPHIRecipe>(Recipe)) {
8732         // VPHeaderPHIRecipes must be kept in the phi section of HeaderVPBB. In
8733         // the following cases, VPHeaderPHIRecipes may be created after non-phi
8734         // recipes and need to be moved to the phi section of HeaderVPBB:
8735         // * tail-folding (non-phi recipes computing the header mask are
8736         // introduced earlier than regular header phi recipes, and should appear
8737         // after them)
8738         // * Optimizing truncates to VPWidenIntOrFpInductionRecipe.
8739 
8740         assert((HeaderVPBB->getFirstNonPhi() == VPBB->end() ||
8741                 CM.foldTailByMasking() || isa<TruncInst>(Instr)) &&
8742                "unexpected recipe needs moving");
8743         Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
8744       } else
8745         VPBB->appendRecipe(Recipe);
8746     }
8747 
8748     VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB);
8749     VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
8750   }
8751 
8752   // After here, VPBB should not be used.
8753   VPBB = nullptr;
8754 
8755   if (CM.requiresScalarEpilogue(Range)) {
8756     // No edge from the middle block to the unique exit block has been inserted
8757     // and there is nothing to fix from the vector loop; phis should have
8758     // incoming values from the scalar loop only.
8759   } else
8760     addUsersInExitBlock(HeaderVPBB, OrigLoop, *Plan);
8761 
8762   assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
8763          !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
8764          "entry block must be set to a VPRegionBlock having a non-empty entry "
8765          "VPBasicBlock");
8766   RecipeBuilder.fixHeaderPhis();
8767 
8768   // ---------------------------------------------------------------------------
8769   // Transform initial VPlan: Apply previously taken decisions, in order, to
8770   // bring the VPlan to its final state.
8771   // ---------------------------------------------------------------------------
8772 
8773   // Adjust the recipes for any inloop reductions.
8774   adjustRecipesForReductions(LatchVPBB, Plan, RecipeBuilder, Range.Start);
8775 
8776   // Interleave memory: for each Interleave Group we marked earlier as relevant
8777   // for this VPlan, replace the Recipes widening its memory instructions with a
8778   // single VPInterleaveRecipe at its insertion point.
8779   for (const auto *IG : InterleaveGroups) {
8780     auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
8781         RecipeBuilder.getRecipe(IG->getInsertPos()));
8782     SmallVector<VPValue *, 4> StoredValues;
8783     for (unsigned i = 0; i < IG->getFactor(); ++i)
8784       if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) {
8785         auto *StoreR =
8786             cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI));
8787         StoredValues.push_back(StoreR->getStoredValue());
8788       }
8789 
8790     bool NeedsMaskForGaps =
8791         IG->requiresScalarEpilogue() && !CM.isScalarEpilogueAllowed();
8792     auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
8793                                         Recipe->getMask(), NeedsMaskForGaps);
8794     VPIG->insertBefore(Recipe);
8795     unsigned J = 0;
8796     for (unsigned i = 0; i < IG->getFactor(); ++i)
8797       if (Instruction *Member = IG->getMember(i)) {
8798         VPRecipeBase *MemberR = RecipeBuilder.getRecipe(Member);
8799         if (!Member->getType()->isVoidTy()) {
8800           VPValue *OriginalV = MemberR->getVPSingleValue();
8801           OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
8802           J++;
8803         }
8804         MemberR->eraseFromParent();
8805       }
8806   }
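       // An illustrative (hypothetical) example of the rewrite above: for an
       // interleave group of factor 2 covering the loads of A[2*i] and
       // A[2*i+1], the two widened load recipes are replaced by a single
       // VPInterleaveRecipe that defines one VPValue per loaded member, and
       // the original member recipes are erased.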
8807 
8808   for (ElementCount VF : Range)
8809     Plan->addVF(VF);
8810   Plan->setName("Initial VPlan");
8811 
8812   // Replace VPValues for known constant strides guaranteed by predicated
8813   // scalar evolution.
8814   for (auto [_, Stride] : Legal->getLAI()->getSymbolicStrides()) {
8815     auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
8816     auto *ScevStride = dyn_cast<SCEVConstant>(PSE.getSCEV(StrideV));
8817     // Only handle constant strides for now.
8818     if (!ScevStride)
8819       continue;
8820     Constant *CI = ConstantInt::get(Stride->getType(), ScevStride->getAPInt());
8821 
8822     auto *ConstVPV = Plan->getVPValueOrAddLiveIn(CI);
8823     // The versioned value may not be used in the loop directly, so just add a
8824     // new live-in in those cases.
8825     Plan->getVPValueOrAddLiveIn(StrideV)->replaceAllUsesWith(ConstVPV);
8826   }
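       // An illustrative (hypothetical) example: if the loop was versioned on
       // the assumption that %stride == 1, PSE folds %stride to the constant 1
       // and the loop above rewrites every use of the live-in modeling %stride
       // to use that constant instead.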
8827 
8828   // From this point onwards, VPlan-to-VPlan transformations may change the plan
8829   // in ways that make accessing values via original IR values incorrect.
8830   Plan->disableValue2VPValue();
8831 
8832   // Sink users of fixed-order recurrence past the recipe defining the previous
8833   // value and introduce FirstOrderRecurrenceSplice VPInstructions.
8834   if (!VPlanTransforms::adjustFixedOrderRecurrences(*Plan, Builder))
8835     return nullptr;
8836 
8837   if (useActiveLaneMask(Style)) {
8838     // TODO: Move checks to VPlanTransforms::addActiveLaneMask once
8839     // TailFoldingStyle is visible there.
8840     bool ForControlFlow = useActiveLaneMaskForControlFlow(Style);
8841     bool WithoutRuntimeCheck =
8842         Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
8843     VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow,
8844                                        WithoutRuntimeCheck);
8845   }
8846   return Plan;
8847 }
8848 
8849 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
8850   // Outer loop handling: They may require CFG and instruction level
8851   // Outer loop handling: outer loops may require CFG and instruction level
8852   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
8853   // the vectorization pipeline.
8854   assert(!OrigLoop->isInnermost());
8855   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
8856 
8857   // Create new empty VPlan
8858   auto Plan = VPlan::createInitialVPlan(
8859       createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop),
8860       *PSE.getSE());
8861 
8862   // Build hierarchical CFG
8863   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
8864   HCFGBuilder.buildHierarchicalCFG();
8865 
8866   for (ElementCount VF : Range)
8867     Plan->addVF(VF);
8868 
8869   VPlanTransforms::VPInstructionsToVPRecipes(
8870       Plan,
8871       [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
8872       *PSE.getSE(), *TLI);
8873 
8874   // Remove the existing terminator of the exiting block of the top-most region.
8875   // A BranchOnCount will be added instead when adding the canonical IV recipes.
8876   auto *Term =
8877       Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator();
8878   Term->eraseFromParent();
8879 
8880   // Tail folding is not supported for outer loops, so the induction increment
8881   // is guaranteed to not wrap.
8882   bool HasNUW = true;
8883   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW,
8884                         DebugLoc());
8885   return Plan;
8886 }
8887 
8888 // Adjust the recipes for reductions. For in-loop reductions, the chain of
8889 // instructions leading from the loop exit instruction to the phi needs to be
8890 // converted to reductions, with one operand being vector and the other being
8891 // the scalar reduction chain. For other reductions, a select is introduced
8892 // between the phi and the live-out recipes when folding the tail.
8893 //
8894 // A ComputeReductionResult recipe is added to the middle block, also for
8895 // in-loop reductions which compute their result in-loop, because generating
8896 // the subsequent bc.merge.rdx phi is driven by ComputeReductionResult recipes.
8897 void LoopVectorizationPlanner::adjustRecipesForReductions(
8898     VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
8899     ElementCount MinVF) {
8900   VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion();
8901   VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock();
8902   // Gather all VPReductionPHIRecipes and sort them so that intermediate stores
8903   // sunk outside of the loop keep the same order as they had in the original
8904   // loop.
8905   SmallVector<VPReductionPHIRecipe *> ReductionPHIList;
8906   for (VPRecipeBase &R : Header->phis()) {
8907     if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
8908       ReductionPHIList.emplace_back(ReductionPhi);
8909   }
8910   bool HasIntermediateStore = false;
8911   stable_sort(ReductionPHIList,
8912               [this, &HasIntermediateStore](const VPReductionPHIRecipe *R1,
8913                                             const VPReductionPHIRecipe *R2) {
8914                 auto *IS1 = R1->getRecurrenceDescriptor().IntermediateStore;
8915                 auto *IS2 = R2->getRecurrenceDescriptor().IntermediateStore;
8916                 HasIntermediateStore |= IS1 || IS2;
8917 
8918                 // If neither of the recipes has an intermediate store, keep the
8919                 // order the same.
8920                 if (!IS1 && !IS2)
8921                   return false;
8922 
8923                 // If only one of the recipes has an intermediate store, then
8924                 // move it towards the beginning of the list.
8925                 if (IS1 && !IS2)
8926                   return true;
8927 
8928                 if (!IS1 && IS2)
8929                   return false;
8930 
8931                 // If both recipes have an intermediate store, then the recipe
8932                 // with the later store should be processed earlier. So it
8933                 // should go to the beginning of the list.
8934                 return DT->dominates(IS2, IS1);
8935               });
8936 
8937   if (HasIntermediateStore && ReductionPHIList.size() > 1)
8938     for (VPRecipeBase *R : ReductionPHIList)
8939       R->moveBefore(*Header, Header->getFirstNonPhi());
8940 
8941   for (VPRecipeBase &R : Header->phis()) {
8942     auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
8943     if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered()))
8944       continue;
8945 
8946     const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
8947     RecurKind Kind = RdxDesc.getRecurrenceKind();
8948     assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) &&
8949            "AnyOf reductions are not allowed for in-loop reductions");
8950 
8951     // Collect the chain of "link" recipes for the reduction starting at PhiR.
8952     SetVector<VPRecipeBase *> Worklist;
8953     Worklist.insert(PhiR);
8954     for (unsigned I = 0; I != Worklist.size(); ++I) {
8955       VPRecipeBase *Cur = Worklist[I];
8956       for (VPUser *U : Cur->getVPSingleValue()->users()) {
8957         auto *UserRecipe = dyn_cast<VPRecipeBase>(U);
8958         if (!UserRecipe)
8959           continue;
8960         assert(UserRecipe->getNumDefinedValues() == 1 &&
8961                "recipes must define exactly one result value");
8962         Worklist.insert(UserRecipe);
8963       }
8964     }
8965 
8966     // Visit operation "Links" along the reduction chain top-down starting from
8967     // the phi until LoopExitValue. We keep track of the previous item
8968     // (PreviousLink) to tell which of the two operands of a Link will remain
8969     // scalar and which will be reduced. For minmax by select(cmp), Link will be
8970     // the select instruction.
8971     VPRecipeBase *PreviousLink = PhiR; // Aka Worklist[0].
8972     for (VPRecipeBase *CurrentLink : Worklist.getArrayRef().drop_front()) {
8973       VPValue *PreviousLinkV = PreviousLink->getVPSingleValue();
8974 
8975       Instruction *CurrentLinkI = CurrentLink->getUnderlyingInstr();
8976 
8977       // Index of the first operand which holds a non-mask vector operand.
8978       unsigned IndexOfFirstOperand;
8979       // Recognize a call to the llvm.fmuladd intrinsic.
8980       bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
8981       VPValue *VecOp;
8982       VPBasicBlock *LinkVPBB = CurrentLink->getParent();
8983       if (IsFMulAdd) {
8984         assert(
8985             RecurrenceDescriptor::isFMulAddIntrinsic(CurrentLinkI) &&
8986             "Expected instruction to be a call to the llvm.fmuladd intrinsic");
8987         assert(((MinVF.isScalar() && isa<VPReplicateRecipe>(CurrentLink)) ||
8988                 isa<VPWidenCallRecipe>(CurrentLink)) &&
8989                CurrentLink->getOperand(2) == PreviousLinkV &&
8990                "expected a call where the previous link is the added operand");
8991 
8992         // If the instruction is a call to the llvm.fmuladd intrinsic then we
8993         // need to create an fmul recipe (multiplying the first two operands of
8994         // the fmuladd together) to use as the vector operand for the fadd
8995         // reduction.
8996         VPInstruction *FMulRecipe = new VPInstruction(
8997             Instruction::FMul,
8998             {CurrentLink->getOperand(0), CurrentLink->getOperand(1)},
8999             CurrentLinkI->getFastMathFlags());
9000         LinkVPBB->insert(FMulRecipe, CurrentLink->getIterator());
9001         VecOp = FMulRecipe;
9002       } else {
9003         if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9004           if (isa<VPWidenRecipe>(CurrentLink)) {
9005             assert(isa<CmpInst>(CurrentLinkI) &&
9006                    "need to have the compare of the select");
9007             continue;
9008           }
9009           assert(isa<VPWidenSelectRecipe>(CurrentLink) &&
9010                  "must be a select recipe");
9011           IndexOfFirstOperand = 1;
9012         } else {
9013           assert((MinVF.isScalar() || isa<VPWidenRecipe>(CurrentLink)) &&
9014                  "Expected to replace a VPWidenSC");
9015           IndexOfFirstOperand = 0;
9016         }
9017         // Note that for non-commutable operands (cmp-selects), the semantics of
9018         // the cmp-select are captured in the recurrence kind.
9019         unsigned VecOpId =
9020             CurrentLink->getOperand(IndexOfFirstOperand) == PreviousLinkV
9021                 ? IndexOfFirstOperand + 1
9022                 : IndexOfFirstOperand;
9023         VecOp = CurrentLink->getOperand(VecOpId);
9024         assert(VecOp != PreviousLinkV &&
9025                CurrentLink->getOperand(CurrentLink->getNumOperands() - 1 -
9026                                        (VecOpId - IndexOfFirstOperand)) ==
9027                    PreviousLinkV &&
9028                "PreviousLinkV must be the operand other than VecOp");
9029       }
9030 
9031       BasicBlock *BB = CurrentLinkI->getParent();
9032       VPValue *CondOp = nullptr;
9033       if (CM.blockNeedsPredicationForAnyReason(BB)) {
9034         VPBuilder::InsertPointGuard Guard(Builder);
9035         Builder.setInsertPoint(CurrentLink);
9036         CondOp = RecipeBuilder.getBlockInMask(BB);
9037       }
9038 
9039       VPReductionRecipe *RedRecipe = new VPReductionRecipe(
9040           RdxDesc, CurrentLinkI, PreviousLinkV, VecOp, CondOp);
9041       // Append the recipe to the end of the VPBasicBlock because we need to
9042       // ensure that it comes after all of its inputs, including CondOp.
9043       // Note that this transformation may leave over dead recipes (including
9044       // CurrentLink), which will be cleaned by a later VPlan transform.
9045       LinkVPBB->appendRecipe(RedRecipe);
9046       CurrentLink->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
9047       PreviousLink = RedRecipe;
9048     }
9049   }
9050   Builder.setInsertPoint(&*LatchVPBB->begin());
9051   for (VPRecipeBase &R :
9052        Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
9053     VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9054     if (!PhiR)
9055       continue;
9056 
9057     const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
9058     // If tail is folded by masking, introduce selects between the phi
9059     // and the live-out instruction of each reduction, at the beginning of the
9060     // dedicated latch block.
9061     auto *OrigExitingVPV = PhiR->getBackedgeValue();
9062     auto *NewExitingVPV = PhiR->getBackedgeValue();
9063     if (!PhiR->isInLoop() && CM.foldTailByMasking()) {
9064       VPValue *Cond = RecipeBuilder.getBlockInMask(OrigLoop->getHeader());
9065       assert(OrigExitingVPV->getDefiningRecipe()->getParent() != LatchVPBB &&
9066              "reduction recipe must be defined before latch");
9067       Type *PhiTy = PhiR->getOperand(0)->getLiveInIRValue()->getType();
9068       std::optional<FastMathFlags> FMFs =
9069           PhiTy->isFloatingPointTy()
9070               ? std::make_optional(RdxDesc.getFastMathFlags())
9071               : std::nullopt;
9072       NewExitingVPV =
9073           Builder.createSelect(Cond, OrigExitingVPV, PhiR, {}, "", FMFs);
9074       OrigExitingVPV->replaceUsesWithIf(NewExitingVPV, [](VPUser &U, unsigned) {
9075         return isa<VPInstruction>(&U) &&
9076                cast<VPInstruction>(&U)->getOpcode() ==
9077                    VPInstruction::ComputeReductionResult;
9078       });
9079       if (PreferPredicatedReductionSelect ||
9080           TTI.preferPredicatedReductionSelect(
9081               PhiR->getRecurrenceDescriptor().getOpcode(), PhiTy,
9082               TargetTransformInfo::ReductionFlags()))
9083         PhiR->setOperand(1, NewExitingVPV);
9084     }
9085 
9086     // If the vector reduction can be performed in a smaller type, we truncate
9087     // then extend the loop exit value to enable InstCombine to evaluate the
9088     // entire expression in the smaller type.
9089     Type *PhiTy = PhiR->getStartValue()->getLiveInIRValue()->getType();
9090     if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
9091       assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
9092       Type *RdxTy = RdxDesc.getRecurrenceType();
9093       auto *Trunc =
9094           new VPWidenCastRecipe(Instruction::Trunc, NewExitingVPV, RdxTy);
9095       auto *Extnd =
9096           RdxDesc.isSigned()
9097               ? new VPWidenCastRecipe(Instruction::SExt, Trunc, PhiTy)
9098               : new VPWidenCastRecipe(Instruction::ZExt, Trunc, PhiTy);
9099 
9100       Trunc->insertAfter(NewExitingVPV->getDefiningRecipe());
9101       Extnd->insertAfter(Trunc);
9102       if (PhiR->getOperand(1) == NewExitingVPV)
9103         PhiR->setOperand(1, Extnd->getVPSingleValue());
9104       NewExitingVPV = Extnd;
9105     }
9106 
9107     // We want code in the middle block to appear to execute on the location of
9108     // the scalar loop's latch terminator because: (a) it is all compiler
9109     // generated, (b) these instructions are always executed after evaluating
9110     // the latch conditional branch, and (c) other passes may add new
9111     // predecessors which terminate on this line. This is the easiest way to
9112     // ensure we don't accidentally cause an extra step back into the loop while
9113     // debugging.
9114     DebugLoc ExitDL = OrigLoop->getLoopLatch()->getTerminator()->getDebugLoc();
9115 
9116     // TODO: At the moment ComputeReductionResult also drives creation of the
9117     // bc.merge.rdx phi nodes, hence it needs to be created unconditionally here
9118     // even for in-loop reductions, until the reduction resume value handling is
9119     // also modeled in VPlan.
9120     auto *FinalReductionResult = new VPInstruction(
9121         VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL);
9122     cast<VPBasicBlock>(VectorLoopRegion->getSingleSuccessor())
9123         ->appendRecipe(FinalReductionResult);
9124     OrigExitingVPV->replaceUsesWithIf(
9125         FinalReductionResult,
9126         [](VPUser &User, unsigned) { return isa<VPLiveOut>(&User); });
9127   }
9128 
9129   VPlanTransforms::clearReductionWrapFlags(*Plan);
9130 }
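     // An illustrative (hypothetical) example for adjustRecipesForReductions:
     // for an in-loop integer add reduction whose link is
     //   %sum.next = add i32 %sum, %val
     // the widened add recipe is replaced by a VPReductionRecipe reducing the
     // vector of %val into the scalar chain, and a ComputeReductionResult
     // recipe in the middle block produces the final scalar value used by the
     // bc.merge.rdx phi and the live-outs.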
9131 
9132 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
9133 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
9134                                VPSlotTracker &SlotTracker) const {
9135   O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
9136   IG->getInsertPos()->printAsOperand(O, false);
9137   O << ", ";
9138   getAddr()->printAsOperand(O, SlotTracker);
9139   VPValue *Mask = getMask();
9140   if (Mask) {
9141     O << ", ";
9142     Mask->printAsOperand(O, SlotTracker);
9143   }
9144 
9145   unsigned OpIdx = 0;
9146   for (unsigned i = 0; i < IG->getFactor(); ++i) {
9147     if (!IG->getMember(i))
9148       continue;
9149     if (getNumStoreOperands() > 0) {
9150       O << "\n" << Indent << "  store ";
9151       getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
9152       O << " to index " << i;
9153     } else {
9154       O << "\n" << Indent << "  ";
9155       getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
9156       O << " = load from index " << i;
9157     }
9158     ++OpIdx;
9159   }
9160 }
9161 #endif
9162 
9163 void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
9164   assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction &&
9165          "Not a pointer induction according to InductionDescriptor!");
9166   assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() &&
9167          "Unexpected type.");
9168 
9169   auto *IVR = getParent()->getPlan()->getCanonicalIV();
9170   PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0));
9171 
9172   if (onlyScalarsGenerated(State.VF)) {
9173     // This is the normalized GEP that starts counting at zero.
9174     Value *PtrInd = State.Builder.CreateSExtOrTrunc(
9175         CanonicalIV, IndDesc.getStep()->getType());
9176     // Determine the number of scalars we need to generate for each unroll
9177     // iteration. If the instruction is uniform, we only need to generate the
9178     // first lane. Otherwise, we generate all VF values.
9179     bool IsUniform = vputils::onlyFirstLaneUsed(this);
9180     assert((IsUniform || !State.VF.isScalable()) &&
9181            "Cannot scalarize a scalable VF");
9182     unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue();
9183 
9184     for (unsigned Part = 0; Part < State.UF; ++Part) {
9185       Value *PartStart =
9186           createStepForVF(State.Builder, PtrInd->getType(), State.VF, Part);
9187 
9188       for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
9189         Value *Idx = State.Builder.CreateAdd(
9190             PartStart, ConstantInt::get(PtrInd->getType(), Lane));
9191         Value *GlobalIdx = State.Builder.CreateAdd(PtrInd, Idx);
9192 
9193         Value *Step = State.get(getOperand(1), VPIteration(Part, Lane));
9194         Value *SclrGep = emitTransformedIndex(
9195             State.Builder, GlobalIdx, IndDesc.getStartValue(), Step,
9196             IndDesc.getKind(), IndDesc.getInductionBinOp());
9197         SclrGep->setName("next.gep");
9198         State.set(this, SclrGep, VPIteration(Part, Lane));
9199       }
9200     }
9201     return;
9202   }
9203 
9204   Type *PhiType = IndDesc.getStep()->getType();
9205 
9206   // Build a pointer phi
9207   Value *ScalarStartValue = getStartValue()->getLiveInIRValue();
9208   Type *ScStValueType = ScalarStartValue->getType();
9209   PHINode *NewPointerPhi =
9210       PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV);
9211 
9212   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
9213   NewPointerPhi->addIncoming(ScalarStartValue, VectorPH);
9214 
9215   // A pointer induction, performed by using a gep
9216   Instruction *InductionLoc = &*State.Builder.GetInsertPoint();
9217 
9218   Value *ScalarStepValue = State.get(getOperand(1), VPIteration(0, 0));
9219   Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF);
9220   Value *NumUnrolledElems =
9221       State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF));
9222   Value *InductionGEP = GetElementPtrInst::Create(
9223       State.Builder.getInt8Ty(), NewPointerPhi,
9224       State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
9225       InductionLoc);
9226   // Add induction update using an incorrect block temporarily. The phi node
9227   // will be fixed after VPlan execution. Note that at this point the latch
9228   // block cannot be used, as it does not exist yet.
9229   // TODO: Model increment value in VPlan, by turning the recipe into a
9230   // multi-def and a subclass of VPHeaderPHIRecipe.
9231   NewPointerPhi->addIncoming(InductionGEP, VectorPH);
9232 
9233   // Create UF many actual address geps that use the pointer
9234   // phi as base and a vectorized version of the step value
9235   // (<step*0, ..., step*N>) as offset.
9236   for (unsigned Part = 0; Part < State.UF; ++Part) {
9237     Type *VecPhiType = VectorType::get(PhiType, State.VF);
9238     Value *StartOffsetScalar =
9239         State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part));
9240     Value *StartOffset =
9241         State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
9242     // Create a vector of consecutive numbers from zero to VF.
9243     StartOffset = State.Builder.CreateAdd(
9244         StartOffset, State.Builder.CreateStepVector(VecPhiType));
9245 
9246     assert(ScalarStepValue == State.get(getOperand(1), VPIteration(Part, 0)) &&
9247            "scalar step must be the same across all parts");
9248     Value *GEP = State.Builder.CreateGEP(
9249         State.Builder.getInt8Ty(), NewPointerPhi,
9250         State.Builder.CreateMul(
9251             StartOffset,
9252             State.Builder.CreateVectorSplat(State.VF, ScalarStepValue),
9253             "vector.gep"));
9254     State.set(this, GEP, Part);
9255   }
9256 }
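     // Schematically, and only as an illustrative sketch, the vector path of
     // VPWidenPointerInductionRecipe::execute above emits IR of the form:
     //   %pointer.phi = phi ptr [ %start, %vector.ph ], [ %ptr.ind, ... ]
     //   %vector.gep  = getelementptr i8, ptr %pointer.phi,
     //                    <VF x i64> (step_vector() * %step)
     //   %ptr.ind     = getelementptr i8, ptr %pointer.phi,
     //                    i64 (%step * VF * UF)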
9257 
9258 void VPDerivedIVRecipe::execute(VPTransformState &State) {
9259   assert(!State.Instance && "VPDerivedIVRecipe being replicated.");
9260 
9261   // Fast-math-flags propagate from the original induction instruction.
9262   IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
9263   if (FPBinOp)
9264     State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags());
9265 
9266   Value *Step = State.get(getStepValue(), VPIteration(0, 0));
9267   Value *CanonicalIV = State.get(getCanonicalIV(), VPIteration(0, 0));
9268   Value *DerivedIV = emitTransformedIndex(
9269       State.Builder, CanonicalIV, getStartValue()->getLiveInIRValue(), Step,
9270       Kind, cast_if_present<BinaryOperator>(FPBinOp));
9271   DerivedIV->setName("offset.idx");
9272   if (TruncResultTy) {
9273     assert(TruncResultTy != DerivedIV->getType() &&
9274            Step->getType()->isIntegerTy() &&
9275            "Truncation requires an integer step");
9276     DerivedIV = State.Builder.CreateTrunc(DerivedIV, TruncResultTy);
9277   }
9278   assert(DerivedIV != CanonicalIV && "IV didn't need transforming?");
9279 
9280   State.set(this, DerivedIV, VPIteration(0, 0));
9281 }
9282 
9283 void VPInterleaveRecipe::execute(VPTransformState &State) {
9284   assert(!State.Instance && "Interleave group being replicated.");
9285   State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
9286                                       getStoredValues(), getMask(),
9287                                       NeedsMaskForGaps);
9288 }
9289 
9290 void VPReductionRecipe::execute(VPTransformState &State) {
9291   assert(!State.Instance && "Reduction being replicated.");
9292   Value *PrevInChain = State.get(getChainOp(), 0);
9293   RecurKind Kind = RdxDesc.getRecurrenceKind();
9294   bool IsOrdered = State.ILV->useOrderedReductions(RdxDesc);
9295   // Propagate the fast-math flags carried by the underlying instruction.
9296   IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
9297   State.Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
9298   for (unsigned Part = 0; Part < State.UF; ++Part) {
9299     Value *NewVecOp = State.get(getVecOp(), Part);
9300     if (VPValue *Cond = getCondOp()) {
9301       Value *NewCond = State.VF.isVector() ? State.get(Cond, Part)
9302                                            : State.get(Cond, {Part, 0});
9303       VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType());
9304       Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType();
9305       Value *Iden = RdxDesc.getRecurrenceIdentity(Kind, ElementTy,
9306                                                   RdxDesc.getFastMathFlags());
9307       if (State.VF.isVector()) {
9308         Iden =
9309             State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden);
9310       }
9311 
9312       Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Iden);
9313       NewVecOp = Select;
9314     }
9315     Value *NewRed;
9316     Value *NextInChain;
9317     if (IsOrdered) {
9318       if (State.VF.isVector())
9319         NewRed = createOrderedReduction(State.Builder, RdxDesc, NewVecOp,
9320                                         PrevInChain);
9321       else
9322         NewRed = State.Builder.CreateBinOp(
9323             (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), PrevInChain,
9324             NewVecOp);
9325       PrevInChain = NewRed;
9326     } else {
9327       PrevInChain = State.get(getChainOp(), Part);
9328       NewRed = createTargetReduction(State.Builder, RdxDesc, NewVecOp);
9329     }
9330     if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9331       NextInChain = createMinMaxOp(State.Builder, RdxDesc.getRecurrenceKind(),
9332                                    NewRed, PrevInChain);
9333     } else if (IsOrdered)
9334       NextInChain = NewRed;
9335     else
9336       NextInChain = State.Builder.CreateBinOp(
9337           (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, PrevInChain);
9338     State.set(this, NextInChain, Part);
9339   }
9340 }
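     // An illustrative (hypothetical) example: for a predicated, unordered
     // integer add reduction with identity 0 and VF = 4, the loop above emits
     // roughly
     //   %sel  = select <4 x i1> %mask, <4 x i32> %vec.op,
     //                  <4 x i32> zeroinitializer
     //   %red  = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %sel)
     //   %next = add i32 %red, %prev
     // where %next becomes the chain value for the next part.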
9341 
9342 void VPReplicateRecipe::execute(VPTransformState &State) {
9343   Instruction *UI = getUnderlyingInstr();
9344   if (State.Instance) { // Generate a single instance.
9345     assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9346     State.ILV->scalarizeInstruction(UI, this, *State.Instance, State);
9347     // Insert scalar instance packing it into a vector.
9348     if (State.VF.isVector() && shouldPack()) {
9349       // If we're constructing lane 0, initialize to start from poison.
9350       if (State.Instance->Lane.isFirstLane()) {
9351         assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9352         Value *Poison = PoisonValue::get(
9353             VectorType::get(UI->getType(), State.VF));
9354         State.set(this, Poison, State.Instance->Part);
9355       }
9356       State.packScalarIntoVectorValue(this, *State.Instance);
9357     }
9358     return;
9359   }
9360 
9361   if (IsUniform) {
9362     // If the recipe is uniform across all parts (instead of just per VF), only
9363     // generate a single instance.
9364     if ((isa<LoadInst>(UI) || isa<StoreInst>(UI)) &&
9365         all_of(operands(), [](VPValue *Op) {
9366           return Op->isDefinedOutsideVectorRegions();
9367         })) {
9368       State.ILV->scalarizeInstruction(UI, this, VPIteration(0, 0), State);
9369       if (user_begin() != user_end()) {
9370         for (unsigned Part = 1; Part < State.UF; ++Part)
9371           State.set(this, State.get(this, VPIteration(0, 0)),
9372                     VPIteration(Part, 0));
9373       }
9374       return;
9375     }
9376 
9377     // Uniform within VL means we need to generate lane 0 only for each
9378     // unrolled copy.
9379     for (unsigned Part = 0; Part < State.UF; ++Part)
9380       State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, 0), State);
9381     return;
9382   }
9383 
9384   // A store of a loop varying value to a uniform address only needs the last
9385   // copy of the store.
9386   if (isa<StoreInst>(UI) &&
9387       vputils::isUniformAfterVectorization(getOperand(1))) {
9388     auto Lane = VPLane::getLastLaneForVF(State.VF);
9389     State.ILV->scalarizeInstruction(UI, this, VPIteration(State.UF - 1, Lane),
9390                                     State);
9391     return;
9392   }
9393 
9394   // Generate scalar instances for all VF lanes of all UF parts.
9395   assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9396   const unsigned EndLane = State.VF.getKnownMinValue();
9397   for (unsigned Part = 0; Part < State.UF; ++Part)
9398     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9399       State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, Lane), State);
9400 }
9401 
9402 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
9403   VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;
9404 
9405   // Attempt to issue a wide load.
9406   LoadInst *LI = dyn_cast<LoadInst>(&Ingredient);
9407   StoreInst *SI = dyn_cast<StoreInst>(&Ingredient);
9408 
9409   assert((LI || SI) && "Invalid Load/Store instruction");
9410   assert((!SI || StoredValue) && "No stored value provided for widened store");
9411   assert((!LI || !StoredValue) && "Stored value provided for widened load");
9412 
9413   Type *ScalarDataTy = getLoadStoreType(&Ingredient);
9414 
9415   auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
9416   const Align Alignment = getLoadStoreAlignment(&Ingredient);
9417   bool CreateGatherScatter = !isConsecutive();
9418 
9419   auto &Builder = State.Builder;
9420   InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF);
9421   bool isMaskRequired = getMask();
9422   if (isMaskRequired) {
9423     // Mask reversal is only needed for non-all-one (non-null) masks, as the
9424     // reverse of a null (all-one) mask is still a null (all-one) mask.
9425     for (unsigned Part = 0; Part < State.UF; ++Part) {
9426       Value *Mask = State.get(getMask(), Part);
9427       if (isReverse())
9428         Mask = Builder.CreateVectorReverse(Mask, "reverse");
9429       BlockInMaskParts[Part] = Mask;
9430     }
9431   }
9432 
9433   // Handle Stores:
9434   if (SI) {
9435     State.setDebugLocFrom(SI->getDebugLoc());
9436 
9437     for (unsigned Part = 0; Part < State.UF; ++Part) {
9438       Instruction *NewSI = nullptr;
9439       Value *StoredVal = State.get(StoredValue, Part);
9440       if (CreateGatherScatter) {
9441         Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
9442         Value *VectorGep = State.get(getAddr(), Part);
9443         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
9444                                             MaskPart);
9445       } else {
9446         if (isReverse()) {
9447           // If we store to reverse consecutive memory locations, then we need
9448           // to reverse the order of elements in the stored value.
9449           StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
9450           // We don't want to update the value in the map as it might be used in
9451           // another expression. So don't call resetVectorValue(StoredVal).
9452         }
9453         auto *VecPtr = State.get(getAddr(), Part);
9454         if (isMaskRequired)
9455           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
9456                                             BlockInMaskParts[Part]);
9457         else
9458           NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
9459       }
9460       State.addMetadata(NewSI, SI);
9461     }
9462     return;
9463   }
9464 
9465   // Handle loads.
9466   assert(LI && "Must have a load instruction");
9467   State.setDebugLocFrom(LI->getDebugLoc());
9468   for (unsigned Part = 0; Part < State.UF; ++Part) {
9469     Value *NewLI;
9470     if (CreateGatherScatter) {
9471       Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
9472       Value *VectorGep = State.get(getAddr(), Part);
9473       NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart,
9474                                          nullptr, "wide.masked.gather");
9475       State.addMetadata(NewLI, LI);
9476     } else {
9477       auto *VecPtr = State.get(getAddr(), Part);
9478       if (isMaskRequired)
9479         NewLI = Builder.CreateMaskedLoad(
9480             DataTy, VecPtr, Alignment, BlockInMaskParts[Part],
9481             PoisonValue::get(DataTy), "wide.masked.load");
9482       else
9483         NewLI =
9484             Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
9485 
9486       // Add metadata to the load, but setVectorValue to the reverse shuffle.
9487       State.addMetadata(NewLI, LI);
9488       if (Reverse)
9489         NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
9490     }
9491 
9492     State.set(getVPSingleValue(), NewLI, Part);
9493   }
9494 }
9495 
9496 // Determine how to lower the scalar epilogue, which depends on 1) optimising
9497 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
9498 // predication, and 4) a TTI hook that analyses whether the loop is suitable
9499 // for predication.
9500 static ScalarEpilogueLowering getScalarEpilogueLowering(
9501     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
9502     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
9503     LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI) {
9504   // 1) OptSize takes precedence over all other options, i.e. if this is set,
9505   // don't look at hints or options, and don't request a scalar epilogue.
9506   // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
9507   // LoopAccessInfo (due to code dependency and not being able to reliably get
9508   // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
9509   // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
9510   // versioning when the vectorization is forced, unlike hasOptSize. So revert
9511   // back to the old way and vectorize with versioning when forced. See D81345.)
9512   if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
9513                                                       PGSOQueryType::IRPass) &&
9514                           Hints.getForce() != LoopVectorizeHints::FK_Enabled))
9515     return CM_ScalarEpilogueNotAllowedOptSize;
9516 
9517   // 2) If set, obey the directives
9518   if (PreferPredicateOverEpilogue.getNumOccurrences()) {
9519     switch (PreferPredicateOverEpilogue) {
9520     case PreferPredicateTy::ScalarEpilogue:
9521       return CM_ScalarEpilogueAllowed;
9522     case PreferPredicateTy::PredicateElseScalarEpilogue:
9523       return CM_ScalarEpilogueNotNeededUsePredicate;
9524     case PreferPredicateTy::PredicateOrDontVectorize:
9525       return CM_ScalarEpilogueNotAllowedUsePredicate;
9526     }
9527   }
9528 
9529   // 3) If set, obey the hints
9530   switch (Hints.getPredicate()) {
9531   case LoopVectorizeHints::FK_Enabled:
9532     return CM_ScalarEpilogueNotNeededUsePredicate;
9533   case LoopVectorizeHints::FK_Disabled:
9534     return CM_ScalarEpilogueAllowed;
9535   }
9536 
9537   // 4) if the TTI hook indicates this is profitable, request predication.
9538   TailFoldingInfo TFI(TLI, &LVL, IAI);
9539   if (TTI->preferPredicateOverEpilogue(&TFI))
9540     return CM_ScalarEpilogueNotNeededUsePredicate;
9541 
9542   return CM_ScalarEpilogueAllowed;
9543 }
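     // An illustrative (hypothetical) example of the precedence above: a
     // function compiled with -Os that also carries
     // '#pragma clang loop vectorize_predicate(enable)' still takes the
     // OptSize path in 1), because the hint is only consulted in 3).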
9544 
9545 // Process the loop in the VPlan-native vectorization path. This path builds
9546 // VPlan upfront in the vectorization pipeline, which allows applying
9547 // VPlan-to-VPlan transformations from the very beginning without modifying the
9548 // input LLVM IR.
9549 static bool processLoopInVPlanNativePath(
9550     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
9551     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
9552     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
9553     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
9554     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
9555     LoopVectorizationRequirements &Requirements) {
9556 
9557   if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
9558     LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
9559     return false;
9560   }
9561   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
9562   Function *F = L->getHeader()->getParent();
9563   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
9564 
9565   ScalarEpilogueLowering SEL =
9566       getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, *LVL, &IAI);
9567 
9568   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
9569                                 &Hints, IAI);
9570   // Use the planner for outer loop vectorization.
9571   // TODO: CM is not used at this point inside the planner. Turn CM into an
9572   // optional argument if we don't need it in the future.
9573   LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, IAI, PSE, Hints,
9574                                ORE);
9575 
9576   // Get user vectorization factor.
9577   ElementCount UserVF = Hints.getWidth();
9578 
9579   CM.collectElementTypesForWidening();
9580 
9581   // Plan how to best vectorize, return the best VF and its cost.
9582   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
9583 
9584   // If we are stress testing VPlan builds, do not attempt to generate vector
9585   // code. Masked vector code generation support will follow soon.
9586   // Also, do not attempt to vectorize if no vector code will be produced.
9587   if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF)
9588     return false;
9589 
9590   VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
9591 
9592   {
9593     bool AddBranchWeights =
9594         hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
9595     GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
9596                              F->getParent()->getDataLayout(), AddBranchWeights);
9597     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
9598                            VF.Width, 1, LVL, &CM, BFI, PSI, Checks);
9599     LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
9600                       << L->getHeader()->getParent()->getName() << "\"\n");
9601     LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false);
9602   }
9603 
9604   reportVectorization(ORE, L, VF, 1);
9605 
9606   // Mark the loop as already vectorized to avoid vectorizing again.
9607   Hints.setAlreadyVectorized();
9608   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
9609   return true;
9610 }
9611 
9612 // Emit a remark if there are stores to floats that required a floating point
9613 // extension. If the vectorized loop was generated with such mixed-precision
9614 // floating point, there will be a performance penalty from the conversion
9615 // overhead and the change in the vector width.
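// As a purely illustrative, hand-written IR pattern (not taken from any test)
// that would trigger the remark below: the stored float traces back through an
// fptrunc and fmul to an fpext, i.e. a float value was widened to double and
// narrowed back:
//   %e = fpext float %x to double
//   %m = fmul double %e, %y
//   %t = fptrunc double %m to float
//   store float %t, ptr %p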
9616 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
9617   SmallVector<Instruction *, 4> Worklist;
9618   for (BasicBlock *BB : L->getBlocks()) {
9619     for (Instruction &Inst : *BB) {
9620       if (auto *S = dyn_cast<StoreInst>(&Inst)) {
9621         if (S->getValueOperand()->getType()->isFloatTy())
9622           Worklist.push_back(S);
9623       }
9624     }
9625   }
9626 
9627   // Traverse the floating point stores upwards, searching for floating point
9628   // conversions.
9629   SmallPtrSet<const Instruction *, 4> Visited;
9630   SmallPtrSet<const Instruction *, 4> EmittedRemark;
9631   while (!Worklist.empty()) {
9632     auto *I = Worklist.pop_back_val();
9633     if (!L->contains(I))
9634       continue;
9635     if (!Visited.insert(I).second)
9636       continue;
9637 
9638     // Emit a remark if the floating point store required a floating
9639     // point conversion.
9640     // TODO: More work could be done to identify the root cause such as a
9641     // constant or a function return type and point the user to it.
9642     if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
9643       ORE->emit([&]() {
9644         return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
9645                                           I->getDebugLoc(), L->getHeader())
9646                << "floating point conversion changes vector width. "
9647                << "Mixed floating point precision requires an up/down "
9648                << "cast that will negatively impact performance.";
9649       });
9650 
9651     for (Use &Op : I->operands())
9652       if (auto *OpI = dyn_cast<Instruction>(Op))
9653         Worklist.push_back(OpI);
9654   }
9655 }
9656 
9657 static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
9658                                        VectorizationFactor &VF,
9659                                        std::optional<unsigned> VScale, Loop *L,
9660                                        ScalarEvolution &SE,
9661                                        ScalarEpilogueLowering SEL) {
9662   InstructionCost CheckCost = Checks.getCost();
9663   if (!CheckCost.isValid())
9664     return false;
9665 
9666   // When only interleaving (VF = 1), the scalar and vector costs will be equal,
9667   // which in turn would lead to a divide by 0. Fall back to the hard threshold.
9668   if (VF.Width.isScalar()) {
9669     if (CheckCost > VectorizeMemoryCheckThreshold) {
9670       LLVM_DEBUG(
9671           dbgs()
9672           << "LV: Interleaving only is not profitable due to runtime checks\n");
9673       return false;
9674     }
9675     return true;
9676   }
9677 
9678   // The scalar cost should only be 0 when vectorizing with a user specified
       // VF/IC. In those cases, runtime checks should always be generated.
9679   double ScalarC = *VF.ScalarCost.getValue();
9680   if (ScalarC == 0)
9681     return true;
9682 
9683   // First, compute the minimum iteration count required so that the vector
9684   // loop outperforms the scalar loop.
9685   //  The total cost of the scalar loop is
9686   //   ScalarC * TC
9687   //  where
9688   //  * TC is the actual trip count of the loop.
9689   //  * ScalarC is the cost of a single scalar iteration.
9690   //
9691   //  The total cost of the vector loop is
9692   //    RtC + VecC * (TC / VF) + EpiC
9693   //  where
9694   //  * RtC is the cost of the generated runtime checks
9695   //  * VecC is the cost of a single vector iteration.
9696   //  * TC is the actual trip count of the loop
9697   //  * VF is the vectorization factor
9698   //  * EpiC is the cost of the generated epilogue, including the cost
9699   //    of the remaining scalar operations.
9700   //
9701   // Vectorization is profitable once the total vector cost is less than the
9702   // total scalar cost:
9703   //   RtC + VecC * (TC / VF) + EpiC <  ScalarC * TC
9704   //
9705   // Now we can compute the minimum required trip count TC as
9706   //   (RtC + EpiC) / (ScalarC - (VecC / VF)) < TC
9707   //
9708   // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
9709   // the computations are performed on doubles, not integers, and the result
9710   // is rounded up, hence we get an upper estimate of the TC.
9711   unsigned IntVF = VF.Width.getKnownMinValue();
9712   if (VF.Width.isScalable()) {
9713     unsigned AssumedMinimumVscale = 1;
9714     if (VScale)
9715       AssumedMinimumVscale = *VScale;
9716     IntVF *= AssumedMinimumVscale;
9717   }
9718   double VecCOverVF = double(*VF.Cost.getValue()) / IntVF;
9719   double RtC = *CheckCost.getValue();
9720   double MinTC1 = RtC / (ScalarC - VecCOverVF);
9721 
9722   // Second, compute a minimum iteration count so that the cost of the
9723   // runtime checks is only a fraction of the total scalar loop cost. This
9724   // adds a loop-dependent bound on the overhead incurred if the runtime
9725   // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC
9726   // * TC. To bound the runtime check to be a fraction 1/X of the scalar
9727   // cost (with X = 10 in the current implementation), compute
9728   //   RtC < ScalarC * TC * (1 / X)  ==>  RtC * X / ScalarC < TC
9729   double MinTC2 = RtC * 10 / ScalarC;
9730 
9731   // Now pick the larger minimum. If it is not a multiple of VF and a scalar
9732   // epilogue is allowed, choose the next closest multiple of VF. This should
9733   // partly compensate for ignoring the epilogue cost.
9734   uint64_t MinTC = std::ceil(std::max(MinTC1, MinTC2));
9735   if (SEL == CM_ScalarEpilogueAllowed)
9736     MinTC = alignTo(MinTC, IntVF);
9737   VF.MinProfitableTripCount = ElementCount::getFixed(MinTC);
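  // Purely illustrative numbers (not taken from any particular cost model):
  // with ScalarC = 4, VecC = 10, IntVF = 4 and RtC = 12 we get
  // VecC / IntVF = 2.5, so
  //   MinTC1 = 12 / (4 - 2.5) = 8   and   MinTC2 = 12 * 10 / 4 = 30.
  // The larger bound, 30, is taken and, when a scalar epilogue is allowed,
  // rounded up to the next multiple of the VF, giving a minimum profitable
  // trip count of 32.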
9738 
9739   LLVM_DEBUG(
9740       dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
9741              << VF.MinProfitableTripCount << "\n");
9742 
9743   // Skip vectorization if the expected trip count is less than the minimum
9744   // required trip count.
9745   if (auto ExpectedTC = getSmallBestKnownTC(SE, L)) {
9746     if (ElementCount::isKnownLT(ElementCount::getFixed(*ExpectedTC),
9747                                 VF.MinProfitableTripCount)) {
9748       LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
9749                            "trip count < minimum profitable trip count ("
9750                         << *ExpectedTC << " < " << VF.MinProfitableTripCount
9751                         << ")\n");
9752 
9753       return false;
9754     }
9755   }
9756   return true;
9757 }
9758 
9759 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
9760     : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
9761                                !EnableLoopInterleaving),
9762       VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
9763                               !EnableLoopVectorization) {}
9764 
9765 bool LoopVectorizePass::processLoop(Loop *L) {
9766   assert((EnableVPlanNativePath || L->isInnermost()) &&
9767          "VPlan-native path is not enabled. Only process inner loops.");
9768 
9769 #ifndef NDEBUG
9770   const std::string DebugLocStr = getDebugLocString(L);
9771 #endif /* NDEBUG */
9772 
9773   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
9774                     << L->getHeader()->getParent()->getName() << "' from "
9775                     << DebugLocStr << "\n");
9776 
9777   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
9778 
9779   LLVM_DEBUG(
9780       dbgs() << "LV: Loop hints:"
9781              << " force="
9782              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
9783                      ? "disabled"
9784                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
9785                             ? "enabled"
9786                             : "?"))
9787              << " width=" << Hints.getWidth()
9788              << " interleave=" << Hints.getInterleave() << "\n");
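  // For reference (one common source of these hints, not the only one): they
  // typically originate from clang loop pragmas such as
  //   #pragma clang loop vectorize(enable) vectorize_width(4) interleave_count(2)
  // which are lowered to llvm.loop metadata read by LoopVectorizeHints.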
9789 
9790   // Function containing loop
9791   Function *F = L->getHeader()->getParent();
9792 
9793   // Looking at the diagnostic output is the only way to determine if a loop
9794   // was vectorized (other than looking at the IR or machine code), so it
9795   // is important to generate an optimization remark for each loop. Most of
9796   // these messages are generated as OptimizationRemarkAnalysis. Remarks
9797   // generated as OptimizationRemark and OptimizationRemarkMissed are
9798   // less verbose, reporting vectorized loops and unvectorized loops that may
9799   // benefit from vectorization, respectively.
9800 
9801   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
9802     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
9803     return false;
9804   }
9805 
9806   PredicatedScalarEvolution PSE(*SE, *L);
9807 
9808   // Check if it is legal to vectorize the loop.
9809   LoopVectorizationRequirements Requirements;
9810   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE,
9811                                 &Requirements, &Hints, DB, AC, BFI, PSI);
9812   if (!LVL.canVectorize(EnableVPlanNativePath)) {
9813     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
9814     Hints.emitRemarkWithHints();
9815     return false;
9816   }
9817 
9818   // Entrance to the VPlan-native vectorization path. Outer loops are processed
9819   // here. They may require CFG and instruction level transformations before
9820   // even evaluating whether vectorization is profitable. Since we cannot modify
9821   // the incoming IR, we need to build VPlan upfront in the vectorization
9822   // pipeline.
9823   if (!L->isInnermost())
9824     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
9825                                         ORE, BFI, PSI, Hints, Requirements);
9826 
9827   assert(L->isInnermost() && "Inner loop expected.");
9828 
9829   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
9830   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
9831 
9832   // If an override option has been passed in for interleaved accesses, use it.
9833   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
9834     UseInterleaved = EnableInterleavedMemAccesses;
9835 
9836   // Analyze interleaved memory accesses.
9837   if (UseInterleaved)
9838     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
9839 
9840   // Check the function attributes and profiles to find out if this function
9841   // should be optimized for size.
9842   ScalarEpilogueLowering SEL =
9843       getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL, &IAI);
9844 
9845   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
9846   // count by optimizing for size, to minimize overheads.
9847   auto ExpectedTC = getSmallBestKnownTC(*SE, L);
9848   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
9849     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
9850                       << "This loop is worth vectorizing only if no scalar "
9851                       << "iteration overheads are incurred.");
9852     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
9853       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
9854     else {
9855       if (*ExpectedTC > TTI->getMinTripCountTailFoldingThreshold()) {
9856         LLVM_DEBUG(dbgs() << "\n");
9857         // Predicate tail-folded loops are efficient even when the loop
9858         // iteration count is low. However, setting the epilogue policy to
9859         // `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops
9860         // with runtime checks. It's more effective to let
9861         // `areRuntimeChecksProfitable` determine if vectorization is beneficial
9862         // for the loop.
9863         if (SEL != CM_ScalarEpilogueNotNeededUsePredicate)
9864           SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
9865       } else {
9866         LLVM_DEBUG(dbgs() << " But the target considers the trip count too "
9867                              "small to consider vectorizing.\n");
9868         reportVectorizationFailure(
9869             "The trip count is below the minimal threshold value.",
9870             "loop trip count is too low, avoiding vectorization",
9871             "LowTripCount", ORE, L);
9872         Hints.emitRemarkWithHints();
9873         return false;
9874       }
9875     }
9876   }
9877 
9878   // Check the function attributes to see if implicit floats or vectors are
9879   // allowed.
9880   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
9881     reportVectorizationFailure(
9882         "Can't vectorize when the NoImplicitFloat attribute is used",
9883         "loop not vectorized due to NoImplicitFloat attribute",
9884         "NoImplicitFloat", ORE, L);
9885     Hints.emitRemarkWithHints();
9886     return false;
9887   }
9888 
9889   // Check if the target supports potentially unsafe FP vectorization.
9890   // FIXME: Add a check for the type of safety issue (denormal, signaling)
9891   // for the target we're vectorizing for, to make sure none of the
9892   // additional fp-math flags can help.
9893   if (Hints.isPotentiallyUnsafe() &&
9894       TTI->isFPVectorizationPotentiallyUnsafe()) {
9895     reportVectorizationFailure(
9896         "Potentially unsafe FP op prevents vectorization",
9897         "loop not vectorized due to unsafe FP support.",
9898         "UnsafeFP", ORE, L);
9899     Hints.emitRemarkWithHints();
9900     return false;
9901   }
9902 
9903   bool AllowOrderedReductions;
9904   // If the flag is set, use that instead and override the TTI behaviour.
9905   if (ForceOrderedReductions.getNumOccurrences() > 0)
9906     AllowOrderedReductions = ForceOrderedReductions;
9907   else
9908     AllowOrderedReductions = TTI->enableOrderedReductions();
9909   if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) {
9910     ORE->emit([&]() {
9911       auto *ExactFPMathInst = Requirements.getExactFPInst();
9912       return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
9913                                                  ExactFPMathInst->getDebugLoc(),
9914                                                  ExactFPMathInst->getParent())
9915              << "loop not vectorized: cannot prove it is safe to reorder "
9916                 "floating-point operations";
9917     });
9918     LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
9919                          "reorder floating-point operations\n");
9920     Hints.emitRemarkWithHints();
9921     return false;
9922   }
9923 
9924   // Use the cost model.
9925   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
9926                                 F, &Hints, IAI);
9927   // Use the planner for vectorization.
9928   LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
9929                                ORE);
9930 
9931   // Get user vectorization factor and interleave count.
9932   ElementCount UserVF = Hints.getWidth();
9933   unsigned UserIC = Hints.getInterleave();
9934 
9935   // Plan how to best vectorize, return the best VF and its cost.
9936   std::optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
9937 
9938   VectorizationFactor VF = VectorizationFactor::Disabled();
9939   unsigned IC = 1;
9940 
9941   bool AddBranchWeights =
9942       hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
9943   GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
9944                            F->getParent()->getDataLayout(), AddBranchWeights);
9945   if (MaybeVF) {
9946     VF = *MaybeVF;
9947     // Select the interleave count.
9948     IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
9949 
9950     unsigned SelectedIC = std::max(IC, UserIC);
9951     //  Optimistically generate runtime checks if they are needed. Drop them if
9952     //  they turn out to not be profitable.
9953     if (VF.Width.isVector() || SelectedIC > 1)
9954       Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
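    // Conceptually (an illustrative sketch, not the exact generated IR), a
    // memory check for two arrays A and B accessed with element size 4 over TC
    // iterations has the form
    //   NoConflict = (A + 4 * TC <= B) || (B + 4 * TC <= A)
    // and guards entry to the vector loop; the scalar loop runs if it fails.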
9955 
9956     // Check if it is profitable to vectorize with runtime checks.
9957     bool ForceVectorization =
9958         Hints.getForce() == LoopVectorizeHints::FK_Enabled;
9959     if (!ForceVectorization &&
9960         !areRuntimeChecksProfitable(Checks, VF, getVScaleForTuning(L, *TTI), L,
9961                                     *PSE.getSE(), SEL)) {
9962       ORE->emit([&]() {
9963         return OptimizationRemarkAnalysisAliasing(
9964                    DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
9965                    L->getHeader())
9966                << "loop not vectorized: cannot prove it is safe to reorder "
9967                   "memory operations";
9968       });
9969       LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
9970       Hints.emitRemarkWithHints();
9971       return false;
9972     }
9973   }
9974 
9975   // Identify the diagnostic messages that should be produced.
9976   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
9977   bool VectorizeLoop = true, InterleaveLoop = true;
9978   if (VF.Width.isScalar()) {
9979     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
9980     VecDiagMsg = std::make_pair(
9981         "VectorizationNotBeneficial",
9982         "the cost-model indicates that vectorization is not beneficial");
9983     VectorizeLoop = false;
9984   }
9985 
9986   if (!MaybeVF && UserIC > 1) {
9987     // Tell the user interleaving was avoided up-front, despite being explicitly
9988     // requested.
9989     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
9990                          "interleaving should be avoided up front\n");
9991     IntDiagMsg = std::make_pair(
9992         "InterleavingAvoided",
9993         "Ignoring UserIC, because interleaving was avoided up front");
9994     InterleaveLoop = false;
9995   } else if (IC == 1 && UserIC <= 1) {
9996     // Tell the user interleaving is not beneficial.
9997     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
9998     IntDiagMsg = std::make_pair(
9999         "InterleavingNotBeneficial",
10000         "the cost-model indicates that interleaving is not beneficial");
10001     InterleaveLoop = false;
10002     if (UserIC == 1) {
10003       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
10004       IntDiagMsg.second +=
10005           " and is explicitly disabled or interleave count is set to 1";
10006     }
10007   } else if (IC > 1 && UserIC == 1) {
10008     // Tell the user interleaving is beneficial, but it is explicitly disabled.
10009     LLVM_DEBUG(
10010         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
10011     IntDiagMsg = std::make_pair(
10012         "InterleavingBeneficialButDisabled",
10013         "the cost-model indicates that interleaving is beneficial "
10014         "but is explicitly disabled or interleave count is set to 1");
10015     InterleaveLoop = false;
10016   }
10017 
10018   // Override IC if user provided an interleave count.
10019   IC = UserIC > 0 ? UserIC : IC;
10020 
10021   // Emit diagnostic messages, if any.
10022   const char *VAPassName = Hints.vectorizeAnalysisPassName();
10023   if (!VectorizeLoop && !InterleaveLoop) {
10024     // Do not vectorize or interleave the loop.
10025     ORE->emit([&]() {
10026       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
10027                                       L->getStartLoc(), L->getHeader())
10028              << VecDiagMsg.second;
10029     });
10030     ORE->emit([&]() {
10031       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
10032                                       L->getStartLoc(), L->getHeader())
10033              << IntDiagMsg.second;
10034     });
10035     return false;
10036   } else if (!VectorizeLoop && InterleaveLoop) {
10037     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10038     ORE->emit([&]() {
10039       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
10040                                         L->getStartLoc(), L->getHeader())
10041              << VecDiagMsg.second;
10042     });
10043   } else if (VectorizeLoop && !InterleaveLoop) {
10044     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10045                       << ") in " << DebugLocStr << '\n');
10046     ORE->emit([&]() {
10047       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
10048                                         L->getStartLoc(), L->getHeader())
10049              << IntDiagMsg.second;
10050     });
10051   } else if (VectorizeLoop && InterleaveLoop) {
10052     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10053                       << ") in " << DebugLocStr << '\n');
10054     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10055   }
10056 
10057   bool DisableRuntimeUnroll = false;
10058   MDNode *OrigLoopID = L->getLoopID();
10059   {
10060     using namespace ore;
10061     if (!VectorizeLoop) {
10062       assert(IC > 1 && "interleave count should not be 1 or 0");
10063       // If we decided that it is not profitable to vectorize the loop, then
10064       // interleave it instead.
10065       InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
10066                                  &CM, BFI, PSI, Checks);
10067 
10068       VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10069       LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);
10070 
10071       ORE->emit([&]() {
10072         return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
10073                                   L->getHeader())
10074                << "interleaved loop (interleaved count: "
10075                << NV("InterleaveCount", IC) << ")";
10076       });
10077     } else {
10078       // If we decided that it is profitable to vectorize the loop, then do it.
10079 
10080       // Consider vectorizing the epilogue too if it's profitable.
10081       VectorizationFactor EpilogueVF =
10082           LVP.selectEpilogueVectorizationFactor(VF.Width, IC);
10083       if (EpilogueVF.Width.isVector()) {
10084 
10085         // The first pass vectorizes the main loop and creates a scalar epilogue
10086         // to be vectorized by executing the plan (potentially with a different
10087         // factor) again shortly afterwards.
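        // As an illustration only (the actual factors come from the planner):
        // a main loop chosen with VF = 8 and IC = 2 might get an epilogue
        // vectorized with VF = 4 and UF = 1, so EPI below would carry
        // {MainLoopVF = 8, MainLoopUF = 2, EpilogueVF = 4, EpilogueUF = 1}.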
10088         EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1);
10089         EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
10090                                            EPI, &LVL, &CM, BFI, PSI, Checks);
10091 
10092         VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF);
10093         const auto &[ExpandedSCEVs, ReductionResumeValues] = LVP.executePlan(
10094             EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV, DT, true);
10095         ++LoopsVectorized;
10096 
10097         // Second pass vectorizes the epilogue and adjusts the control flow
10098         // edges from the first pass.
10099         EPI.MainLoopVF = EPI.EpilogueVF;
10100         EPI.MainLoopUF = EPI.EpilogueUF;
10101         EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
10102                                                  ORE, EPI, &LVL, &CM, BFI, PSI,
10103                                                  Checks);
10104 
10105         VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF);
10106         VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion();
10107         VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
10108         Header->setName("vec.epilog.vector.body");
10109 
10110         // Re-use the trip count and steps expanded for the main loop, as
10111         // skeleton creation needs them as values that dominate both the scalar
10112         // and vector epilogue loops.
10113         // TODO: This is a workaround needed for epilogue vectorization and it
10114         // should be removed once induction resume value creation is done
10115         // directly in VPlan.
10116         EpilogILV.setTripCount(MainILV.getTripCount());
10117         for (auto &R : make_early_inc_range(*BestEpiPlan.getPreheader())) {
10118           auto *ExpandR = cast<VPExpandSCEVRecipe>(&R);
10119           auto *ExpandedVal = BestEpiPlan.getVPValueOrAddLiveIn(
10120               ExpandedSCEVs.find(ExpandR->getSCEV())->second);
10121           ExpandR->replaceAllUsesWith(ExpandedVal);
10122           ExpandR->eraseFromParent();
10123         }
10124 
10125         // Ensure that the start values for any VPWidenIntOrFpInductionRecipe,
10126         // VPWidenPointerInductionRecipe and VPReductionPHIRecipes are updated
10127         // before vectorizing the epilogue loop.
10128         for (VPRecipeBase &R : Header->phis()) {
10129           if (isa<VPCanonicalIVPHIRecipe>(&R))
10130             continue;
10131 
10132           Value *ResumeV = nullptr;
10133           // TODO: Move setting of resume values to prepareToExecute.
10134           if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
10135             ResumeV = ReductionResumeValues
10136                           .find(&ReductionPhi->getRecurrenceDescriptor())
10137                           ->second;
10138           } else {
10139             // Create induction resume values for both widened pointer and
10140             // integer/fp inductions and update the start value of the induction
10141             // recipes to use the resume value.
10142             PHINode *IndPhi = nullptr;
10143             const InductionDescriptor *ID;
10144             if (auto *Ind = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
10145               IndPhi = cast<PHINode>(Ind->getUnderlyingValue());
10146               ID = &Ind->getInductionDescriptor();
10147             } else {
10148               auto *WidenInd = cast<VPWidenIntOrFpInductionRecipe>(&R);
10149               IndPhi = WidenInd->getPHINode();
10150               ID = &WidenInd->getInductionDescriptor();
10151             }
10152 
10153             ResumeV = MainILV.createInductionResumeValue(
10154                 IndPhi, *ID, getExpandedStep(*ID, ExpandedSCEVs),
10155                 {EPI.MainLoopIterationCountCheck});
10156           }
10157           assert(ResumeV && "Must have a resume value");
10158           VPValue *StartVal = BestEpiPlan.getVPValueOrAddLiveIn(ResumeV);
10159           cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
10160         }
10161 
10162         LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
10163                         DT, true, &ExpandedSCEVs);
10164         ++LoopsEpilogueVectorized;
10165 
10166         if (!MainILV.areSafetyChecksAdded())
10167           DisableRuntimeUnroll = true;
10168       } else {
10169         InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
10170                                VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
10171                                PSI, Checks);
10172 
10173         VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10174         LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
10175         ++LoopsVectorized;
10176 
10177         // Add metadata to disable runtime unrolling of the scalar loop when there
10178         // are no runtime checks about strides and memory. A scalar loop that is
10179         // rarely used is not worth unrolling.
10180         if (!LB.areSafetyChecksAdded())
10181           DisableRuntimeUnroll = true;
10182       }
10183       // Report the vectorization decision.
10184       reportVectorization(ORE, L, VF, IC);
10185     }
10186 
10187     if (ORE->allowExtraAnalysis(LV_NAME))
10188       checkMixedPrecision(L, ORE);
10189   }
10190 
10191   std::optional<MDNode *> RemainderLoopID =
10192       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
10193                                       LLVMLoopVectorizeFollowupEpilogue});
10194   if (RemainderLoopID) {
10195     L->setLoopID(*RemainderLoopID);
10196   } else {
10197     if (DisableRuntimeUnroll)
10198       AddRuntimeUnrollDisableMetaData(L);
10199 
10200     // Mark the loop as already vectorized to avoid vectorizing again.
10201     Hints.setAlreadyVectorized();
10202   }
10203 
10204   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10205   return true;
10206 }
10207 
10208 LoopVectorizeResult LoopVectorizePass::runImpl(
10209     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
10210     DominatorTree &DT_, BlockFrequencyInfo *BFI_, TargetLibraryInfo *TLI_,
10211     DemandedBits &DB_, AssumptionCache &AC_, LoopAccessInfoManager &LAIs_,
10212     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
10213   SE = &SE_;
10214   LI = &LI_;
10215   TTI = &TTI_;
10216   DT = &DT_;
10217   BFI = BFI_;
10218   TLI = TLI_;
10219   AC = &AC_;
10220   LAIs = &LAIs_;
10221   DB = &DB_;
10222   ORE = &ORE_;
10223   PSI = PSI_;
10224 
10225   // Don't attempt if
10226   // 1. the target claims to have no vector registers, and
10227   // 2. interleaving won't help ILP.
10228   //
10229   // The second condition is necessary because, even if the target has no
10230   // vector registers, loop vectorization may still enable scalar
10231   // interleaving.
10232   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
10233       TTI->getMaxInterleaveFactor(ElementCount::getFixed(1)) < 2)
10234     return LoopVectorizeResult(false, false);
10235 
10236   bool Changed = false, CFGChanged = false;
10237 
10238   // The vectorizer requires loops to be in simplified form.
10239   // Since simplification may add new inner loops, it has to run before the
10240   // legality and profitability checks. This means running the loop vectorizer
10241   // will simplify all loops, regardless of whether anything end up being
10242   // will simplify all loops, regardless of whether anything ends up being
10243   for (const auto &L : *LI)
10244     Changed |= CFGChanged |=
10245         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10246 
10247   // Build up a worklist of inner-loops to vectorize. This is necessary as
10248   // the act of vectorizing or partially unrolling a loop creates new loops
10249   // and can invalidate iterators across the loops.
10250   SmallVector<Loop *, 8> Worklist;
10251 
10252   for (Loop *L : *LI)
10253     collectSupportedLoops(*L, LI, ORE, Worklist);
10254 
10255   LoopsAnalyzed += Worklist.size();
10256 
10257   // Now walk the identified inner loops.
10258   while (!Worklist.empty()) {
10259     Loop *L = Worklist.pop_back_val();
10260 
10261     // For the inner loops we actually process, form LCSSA to simplify the
10262     // transform.
10263     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
10264 
10265     Changed |= CFGChanged |= processLoop(L);
10266 
10267     if (Changed) {
10268       LAIs->clear();
10269 
10270 #ifndef NDEBUG
10271       if (VerifySCEV)
10272         SE->verify();
10273 #endif
10274     }
10275   }
10276 
10277   // Process each loop nest in the function.
10278   return LoopVectorizeResult(Changed, CFGChanged);
10279 }
10280 
10281 PreservedAnalyses LoopVectorizePass::run(Function &F,
10282                                          FunctionAnalysisManager &AM) {
10283     auto &LI = AM.getResult<LoopAnalysis>(F);
10284     // If there are no loops in the function, return before computing other
10285     // expensive analyses.
10286     if (LI.empty())
10287       return PreservedAnalyses::all();
10288     auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
10289     auto &TTI = AM.getResult<TargetIRAnalysis>(F);
10290     auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
10291     auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
10292     auto &AC = AM.getResult<AssumptionAnalysis>(F);
10293     auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
10294     auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
10295 
10296     LoopAccessInfoManager &LAIs = AM.getResult<LoopAccessAnalysis>(F);
10297     auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
10298     ProfileSummaryInfo *PSI =
10299         MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
10300     BlockFrequencyInfo *BFI = nullptr;
10301     if (PSI && PSI->hasProfileSummary())
10302       BFI = &AM.getResult<BlockFrequencyAnalysis>(F);
10303     LoopVectorizeResult Result =
10304         runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AC, LAIs, ORE, PSI);
10305     if (!Result.MadeAnyChange)
10306       return PreservedAnalyses::all();
10307     PreservedAnalyses PA;
10308 
10309     if (isAssignmentTrackingEnabled(*F.getParent())) {
10310       for (auto &BB : F)
10311         RemoveRedundantDbgInstrs(&BB);
10312     }
10313 
10314     // We currently do not preserve loopinfo/dominator analyses with outer loop
10315     // vectorization. Until this is addressed, mark these analyses as preserved
10316     // only for non-VPlan-native path.
10317     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
10318     if (!EnableVPlanNativePath) {
10319       PA.preserve<LoopAnalysis>();
10320       PA.preserve<DominatorTreeAnalysis>();
10321       PA.preserve<ScalarEvolutionAnalysis>();
10322     }
10323 
10324     if (Result.MadeCFGChange) {
10325       // Making CFG changes likely means a loop got vectorized. Indicate that
10326       // extra simplification passes should be run.
10327       // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
10328       // be run if runtime checks have been added.
10329       AM.getResult<ShouldRunExtraVectorPasses>(F);
10330       PA.preserve<ShouldRunExtraVectorPasses>();
10331     } else {
10332       PA.preserveSet<CFGAnalyses>();
10333     }
10334     return PA;
10335 }
10336 
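// For reference, with the default options the textual pipeline form printed
// below looks roughly like (the exact pass name comes from the pass mapping):
//   loop-vectorize<no-interleave-forced-only;no-vectorize-forced-only;>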
10337 void LoopVectorizePass::printPipeline(
10338     raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
10339   static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
10340       OS, MapClassName2PassName);
10341 
10342   OS << '<';
10343   OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
10344   OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
10345   OS << '>';
10346 }
10347